import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import missingno as msno
import collections
import os
from stdnum.ean import *
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from collections import Counter
import folium
import folium.plugins
import branca
import branca.colormap as cm
import time
sns.set()
building_df = pd.read_csv('2016_Building_Energy_Benchmarking.csv')
initial_nbr_rows = building_df.shape[0]
initial_nbr_columns = building_df.shape[1]
print('The size of the table is', initial_nbr_rows,
'rows, and', initial_nbr_columns, 'columns.')
building_df.sort_values(by='OSEBuildingID', inplace=True)
The size of the table is 3376 rows, and 46 columns.
# To display all columns when needed
pd.set_option('display.max_columns', None)
# Function to add infos about the type and null values to .describe function
def df_description(df):
# General description
description = df.describe(include='all', datetime_is_numeric=True)
# Types and content of the columns
description_type = pd.DataFrame(df.dtypes, columns=['type']).T
number_null = (df.shape[0]-description.loc['count'].T)
percentage_null = (df.shape[0]-description.loc['count'].T)*100/df.shape[0]
description_null = pd.DataFrame([number_null, percentage_null],
index=['number null', 'percentage null'])
# Concatenate general, type and null stats
description = pd.concat([description_type, description_null, description])
return description
print('Percentage of NaN in the dataset equal to',
sum(building_df.isna().sum()/building_df.shape[0]*100)/building_df.shape[1])
df_description(building_df)
Percentage of NaN in the dataset equal to 12.84772305790233
| OSEBuildingID | DataYear | BuildingType | PrimaryPropertyType | PropertyName | Address | City | State | ZipCode | TaxParcelIdentificationNumber | CouncilDistrictCode | Neighborhood | Latitude | Longitude | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | ListOfAllPropertyUseTypes | LargestPropertyUseType | LargestPropertyUseTypeGFA | SecondLargestPropertyUseType | SecondLargestPropertyUseTypeGFA | ThirdLargestPropertyUseType | ThirdLargestPropertyUseTypeGFA | YearsENERGYSTARCertified | ENERGYSTARScore | SiteEUI(kBtu/sf) | SiteEUIWN(kBtu/sf) | SourceEUI(kBtu/sf) | SourceEUIWN(kBtu/sf) | SiteEnergyUse(kBtu) | SiteEnergyUseWN(kBtu) | SteamUse(kBtu) | Electricity(kWh) | Electricity(kBtu) | NaturalGas(therms) | NaturalGas(kBtu) | DefaultData | Comments | ComplianceStatus | Outlier | TotalGHGEmissions | GHGEmissionsIntensity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| type | int64 | int64 | object | object | object | object | object | object | float64 | object | int64 | object | float64 | float64 | int64 | float64 | int64 | int64 | int64 | int64 | object | object | float64 | object | float64 | object | float64 | object | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | bool | float64 | object | object | float64 | float64 |
| number null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 16.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 8.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | 20.0 | 20.0 | 1697.0 | 1697.0 | 2780.0 | 2780.0 | 3257.0 | 843.0 | 7.0 | 6.0 | 9.0 | 9.0 | 5.0 | 6.0 | 9.0 | 9.0 | 9.0 | 9.0 | 9.0 | 0.0 | 3376.0 | 0.0 | 3344.0 | 9.0 | 9.0 |
| percentage null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.473934 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.236967 | 0.0 | 0.0 | 0.0 | 0.0 | 0.266588 | 0.592417 | 0.592417 | 50.266588 | 50.266588 | 82.345972 | 82.345972 | 96.475118 | 24.970379 | 0.207346 | 0.177725 | 0.266588 | 0.266588 | 0.148104 | 0.177725 | 0.266588 | 0.266588 | 0.266588 | 0.266588 | 0.266588 | 0.0 | 100.0 | 0.0 | 99.052133 | 0.266588 | 0.266588 |
| count | 3376.0 | 3376.0 | 3376 | 3376 | 3376 | 3376 | 3376 | 3376 | 3360.0 | 3376 | 3376.0 | 3376 | 3376.0 | 3376.0 | 3376.0 | 3368.0 | 3376.0 | 3376.0 | 3376.0 | 3376.0 | 3367 | 3356 | 3356.0 | 1679 | 1679.0 | 596 | 596.0 | 119 | 2533.0 | 3369.0 | 3370.0 | 3367.0 | 3367.0 | 3371.0 | 3370.0 | 3367.0 | 3367.0 | 3367.0 | 3367.0 | 3367.0 | 3376 | 0.0 | 3376 | 32 | 3367.0 | 3367.0 |
| unique | NaN | NaN | 8 | 24 | 3362 | 3354 | 1 | 1 | NaN | 3268 | NaN | 19 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 466 | 56 | NaN | 50 | NaN | 44 | NaN | 65 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2 | NaN | 4 | 2 | NaN | NaN |
| top | NaN | NaN | NonResidential | Low-Rise Multifamily | Northgate Plaza | 2203 Airport Way S | Seattle | WA | NaN | 1625049001 | NaN | DOWNTOWN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Multifamily Housing | Multifamily Housing | NaN | Parking | NaN | Retail Store | NaN | 2016 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | Compliant | Low outlier | NaN | NaN |
| freq | NaN | NaN | 1460 | 987 | 3 | 4 | 3376 | 3376 | NaN | 8 | NaN | 573 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 866 | 1667 | NaN | 976 | NaN | 110 | NaN | 14 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3263 | NaN | 3211 | 23 | NaN | NaN |
| mean | 21208.991114 | 2016.0 | NaN | NaN | NaN | NaN | NaN | NaN | 98116.949107 | NaN | 4.439277 | NaN | 47.624033 | -122.334795 | 1968.573164 | 1.106888 | 4.709123 | 94833.537322 | 8001.526066 | 86832.011256 | NaN | NaN | 79177.638558 | NaN | 28444.075817 | NaN | 11738.675166 | NaN | 67.918674 | 54.732116 | 57.033798 | 134.232848 | 137.783932 | 5403667.294533 | 5276725.714395 | 274595.898209 | 1086638.966571 | 3707612.161594 | 13685.045376 | 1368504.541443 | NaN | NaN | NaN | NaN | 119.723971 | 1.175916 |
| std | 12223.757015 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 18.615205 | NaN | 2.120625 | NaN | 0.047758 | 0.027203 | 33.088156 | 2.108402 | 5.494465 | 218837.60712 | 32326.723928 | 207939.811923 | NaN | NaN | 201703.407492 | NaN | 54392.917928 | NaN | 29331.199286 | NaN | 26.873271 | 56.273124 | 57.16333 | 139.287554 | 139.109807 | 21610628.627639 | 15938786.484121 | 3912173.392696 | 4352478.355209 | 14850656.138963 | 67097.808296 | 6709780.83488 | NaN | NaN | NaN | NaN | 538.832227 | 1.821452 |
| min | 1.0 | 2016.0 | NaN | NaN | NaN | NaN | NaN | NaN | 98006.0 | NaN | 1.0 | NaN | 47.49917 | -122.41425 | 1900.0 | 0.0 | 0.0 | 11285.0 | 0.0 | 3636.0 | NaN | NaN | 5656.0 | NaN | 0.0 | NaN | 0.0 | NaN | 1.0 | 0.0 | 0.0 | 0.0 | -2.1 | 0.0 | 0.0 | 0.0 | -33826.80078 | -115417.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | -0.8 | -0.02 |
| 25% | 19990.75 | 2016.0 | NaN | NaN | NaN | NaN | NaN | NaN | 98105.0 | NaN | 3.0 | NaN | 47.59986 | -122.350662 | 1948.0 | 1.0 | 2.0 | 28487.0 | 0.0 | 27756.0 | NaN | NaN | 25094.75 | NaN | 5000.0 | NaN | 2239.0 | NaN | 53.0 | 27.9 | 29.4 | 74.699997 | 78.400002 | 925128.59375 | 970182.234375 | 0.0 | 187422.94535 | 639487.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | 9.495 | 0.21 |
| 50% | 23112.0 | 2016.0 | NaN | NaN | NaN | NaN | NaN | NaN | 98115.0 | NaN | 4.0 | NaN | 47.618675 | -122.332495 | 1975.0 | 1.0 | 4.0 | 44175.0 | 0.0 | 43216.0 | NaN | NaN | 39894.0 | NaN | 10664.0 | NaN | 5043.0 | NaN | 75.0 | 38.599998 | 40.900002 | 96.199997 | 101.099998 | 1803753.25 | 1904452.0 | 0.0 | 345129.9063 | 1177583.0 | 3237.537598 | 323754.0 | NaN | NaN | NaN | NaN | 33.92 | 0.61 |
| 75% | 25994.25 | 2016.0 | NaN | NaN | NaN | NaN | NaN | NaN | 98122.0 | NaN | 7.0 | NaN | 47.657115 | -122.319407 | 1997.0 | 1.0 | 5.0 | 90992.0 | 0.0 | 84276.25 | NaN | NaN | 76200.25 | NaN | 26640.0 | NaN | 10138.75 | NaN | 90.0 | 60.400002 | 64.275002 | 143.899994 | 148.349998 | 4222455.25 | 4381429.125 | 0.0 | 829317.84375 | 2829632.5 | 11890.33496 | 1189033.5 | NaN | NaN | NaN | NaN | 93.94 | 1.37 |
| max | 50226.0 | 2016.0 | NaN | NaN | NaN | NaN | NaN | NaN | 98272.0 | NaN | 7.0 | NaN | 47.73387 | -122.220966 | 2015.0 | 111.0 | 99.0 | 9320156.0 | 512608.0 | 9320156.0 | NaN | NaN | 9320156.0 | NaN | 686750.0 | NaN | 459748.0 | NaN | 100.0 | 834.400024 | 834.400024 | 2620.0 | 2620.0 | 873923712.0 | 471613856.0 | 134943456.0 | 192577488.0 | 657074389.0 | 2979090.0 | 297909000.0 | NaN | NaN | NaN | NaN | 16870.98 | 34.09 |
# Function to check and remove all columns full of 0 or null values
def columns_null_zero(df):
null_zero_df = ((df == 0) | (df.isnull())).sum().reset_index()
null_zero_df.columns = ['Columns_name', 'Numbers_null_zero']
print('Number of columns with all values zero or null equal to',
null_zero_df[null_zero_df['Numbers_null_zero'] == df.shape[0]].shape[0])
df_reduced = df[list(null_zero_df.loc[null_zero_df['Numbers_null_zero'] != df.shape[0],
'Columns_name'].values)]
print('After removing those columns, the number of columns left =',
df_reduced.shape[1])
return df_reduced
building_df = columns_null_zero(building_df)
Number of columns with all values zero or null equal to 1 After removing those columns, the number of columns left = 45
building_df = building_df.drop(['YearsENERGYSTARCertified'], axis=1)
building_df = building_df.drop(['Address', 'City', 'State',
'TaxParcelIdentificationNumber',
'DefaultData', 'ComplianceStatus'], axis=1)
building_df = building_df.drop(['SiteEUIWN(kBtu/sf)',
'SourceEUIWN(kBtu/sf)',
'SiteEnergyUseWN(kBtu)'], axis=1)
building_df = building_df.drop(['Electricity(kWh)',
'NaturalGas(therms)'], axis=1)
# Function that remove duplicated rows on selected columns
def removing_duplicates(df, df_columns):
print('The number of duplicated rows on selected columns equal to',
df.loc[df[df_columns].duplicated(keep=False), :].shape[0])
df.drop_duplicates(subset=df_columns, inplace=True, ignore_index=True)
print('After removing those rows, we are left with this number of rows:', df.shape[0])
return df
building_df = removing_duplicates(building_df, 'OSEBuildingID')
The number of duplicated rows on selected columns equal to 0 After removing those rows, we are left with this number of rows: 3376
# Function to check and remove all rows of selected column(s) full of 0 or null values
def rows_zero_null_values(df, df_columns):
# For multiple columns
if np.atleast_2d(df_columns.T).shape[0] > 1:
print('The number of rows with all values equal to 0 or null in the selected columns equal to',
df[((df_columns == 0) | (df_columns.isnull())).all(axis=1)].shape[0])
df_cleaned = df[((df_columns != 0) & (df_columns.notnull())).any(axis=1)]
print('After removing those rows, we are left with this number of rows:',
df_cleaned.shape[0])
return df_cleaned
# For single column
elif np.atleast_2d(df_columns.T).shape[0] == 1:
print('The number of rows with all values equal to 0 or null in the selected column equal to',
df[(df_columns == 0) | (df_columns.isnull())].shape[0])
df.drop(df[(df_columns == 0) | (df_columns.isnull())].index, inplace=True)
print('After removing those rows, we are left with this number of rows:',
df.shape[0])
return df
else:
print('please enter the column(s) name into the function')
columns_list = ['ENERGYSTARScore', 'SiteEUI(kBtu/sf)', 'SourceEUI(kBtu/sf)',
'SiteEnergyUse(kBtu)', 'SteamUse(kBtu)', 'Electricity(kBtu)',
'NaturalGas(kBtu)', 'TotalGHGEmissions', 'GHGEmissionsIntensity']
building_df = rows_zero_null_values(building_df, building_df[columns_list])
The number of rows with all values equal to 0 or null in the selected columns equal to 11 After removing those rows, we are left with this number of rows: 3365
print('The number of buildings with no entry on SiteEnergyUse(kBtu) or TotalGHGEmissions equal to',
building_df[((building_df['SiteEnergyUse(kBtu)'].isnull())
| (building_df['TotalGHGEmissions'].isnull()))].shape[0])
building_df = building_df[~((building_df['SiteEnergyUse(kBtu)'].isnull())
| (building_df['TotalGHGEmissions'].isnull()))]
print('After removing those buildings, we are left with this number of rows:',
building_df.shape[0])
The number of buildings with no entry on SiteEnergyUse(kBtu) or TotalGHGEmissions equal to 4 After removing those buildings, we are left with this number of rows: 3361
columns_list = ['NumberofBuildings', 'NumberofFloors', 'PropertyGFATotal',
'PropertyGFAParking', 'PropertyGFABuilding(s)',
'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA',
'ThirdLargestPropertyUseTypeGFA', 'ENERGYSTARScore',
'SiteEUI(kBtu/sf)', 'SourceEUI(kBtu/sf)', 'SiteEnergyUse(kBtu)',
'SteamUse(kBtu)', 'Electricity(kBtu)', 'NaturalGas(kBtu)',
'TotalGHGEmissions', 'GHGEmissionsIntensity']
print('Number of nuildings with negative values in any numerical feature:',
building_df[(building_df[columns_list] < 0).any(axis=1)].shape[0])
condition = building_df[(building_df[columns_list] < 0).any(axis=1)]
building_df.drop(condition.index, inplace=True)
print('After removing those buildings, we are left with this number of rows:',
building_df.shape[0])
Number of nuildings with negative values in any numerical feature: 1 After removing those buildings, we are left with this number of rows: 3360
Columns outliers classify a building as high or low outlier depending if the building energy use intensity EUI is in the 5% highest intensity or in the 5% lowest intensity respectively. We are going to drop those low and high building from the database. Then drop the outliers column.
print('The number of buildings with the highest or lowest 5% in terms of EUI equal to',
building_df[building_df['Outlier'].notna()].shape[0])
building_df = building_df[building_df['Outlier'].isna()]
print('After removing those buildings, we are left with this number of rows:',
building_df.shape[0])
building_df = building_df.drop(['Outlier'], axis=1)
The number of buildings with the highest or lowest 5% in terms of EUI equal to 32 After removing those buildings, we are left with this number of rows: 3328
columns_list = ['PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)',
'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA',
'ThirdLargestPropertyUseTypeGFA']
building_df[columns_list].plot(kind='box', subplots=True,
figsize=(20, 5), layout=(1, 6))
plt.show()
columns_list = ['SiteEUI(kBtu/sf)', 'SiteEnergyUse(kBtu)', 'SteamUse(kBtu)',
'Electricity(kBtu)', 'NaturalGas(kBtu)']
building_df[columns_list].plot(kind='box', subplots=True,
figsize=(15, 5), layout=(1, 5))
plt.show()
columns_list = ['TotalGHGEmissions','GHGEmissionsIntensity']
building_df[columns_list].plot(kind='box', subplots=True,
figsize=(10, 3), layout=(1, 2))
plt.show()
Dropping all outliers appearing at least 2 times outside the interval Q1-5IQR -- Q3+5IQR on columns 'PropertyGFATotal', 'PropertyGFABuilding(s)', 'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA','SiteEUI(kBtu/sf)','SourceEUI(kBtu/sf)','SiteEnergyUse(kBtu)','Electricity(kBtu)','NaturalGas(kBtu)','TotalGHGEmissions','GHGEmissionsIntensity'
# Function that check for outliers on multiple columns and count the number of times an outlier appeared.
# If an outlier appeared at least n times, we drop that outlier from the dataset.
def Outliers_Limits(df, col_list, n):
indexes_outliers = []
# Checking for outliers and getting their index
for col in col_list:
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
lower_limit = Q1 - 5*IQR
upper_limit = Q3 + 5*IQR
list_outliers = df[(df[col] < lower_limit) | (df[col] > upper_limit)].index
indexes_outliers.extend(list_outliers)
# Counting the number of time an outlier appeared
indexes_outliers = Counter(indexes_outliers)
# Selection of outliers appearing at least n time
multiple_outliers = list( k for k, v in indexes_outliers.items() if v > n )
print('Number of outliers to remove: {}'.format(len(multiple_outliers)))
# Dropping those outliers that appeared at least n time
df.drop(multiple_outliers, axis=0, inplace=True)
columns_list = ['PropertyGFATotal', 'PropertyGFABuilding(s)',
'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA',
'SiteEUI(kBtu/sf)', 'SiteEnergyUse(kBtu)', 'Electricity(kBtu)',
'NaturalGas(kBtu)', 'TotalGHGEmissions', 'GHGEmissionsIntensity']
Outliers_Limits(building_df, columns_list, 1)
print('After removing those outliers, we are left with this number of buildings:',
building_df.shape[0])
Number of outliers to remove: 216 After removing those outliers, we are left with this number of buildings: 3112
building_df[['NumberofBuildings', 'NumberofFloors']].plot(kind='box', subplots=True,
figsize=(10, 3), layout=(1, 2))
plt.show()
building_df[(building_df['NumberofFloors'] > 50)]
| OSEBuildingID | DataYear | BuildingType | PrimaryPropertyType | PropertyName | ZipCode | CouncilDistrictCode | Neighborhood | Latitude | Longitude | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | ListOfAllPropertyUseTypes | LargestPropertyUseType | LargestPropertyUseTypeGFA | SecondLargestPropertyUseType | SecondLargestPropertyUseTypeGFA | ThirdLargestPropertyUseType | ThirdLargestPropertyUseTypeGFA | ENERGYSTARScore | SiteEUI(kBtu/sf) | SourceEUI(kBtu/sf) | SiteEnergyUse(kBtu) | SteamUse(kBtu) | Electricity(kBtu) | NaturalGas(kBtu) | TotalGHGEmissions | GHGEmissionsIntensity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1356 | 21611 | 2016 | NonResidential | Worship Facility | Seattle Chinese Baptist Church | 98108.0 | 2 | GREATER DUWAMISH | 47.55072 | -122.30265 | 1977 | 1.0 | 99 | 21948 | 0 | 21948 | Worship Facility | Worship Facility | 21948.0 | NaN | NaN | NaN | NaN | 80.0 | 14.9 | 46.599998 | 326001.1875 | 0.0 | 326001.0 | 0.0 | 2.27 | 0.1 |
Changing the 99 floor Seattle Chinese Baptist Church to NaN.
building_df.loc[building_df[(building_df['NumberofFloors'] > 80)].index,
'NumberofFloors'] = np.nan
print('Percentage of NaN in the dataset equal to',
sum(building_df.isna().sum()/building_df.shape[0]*100)/building_df.shape[1])
building_df = building_df.reset_index(drop=True)
df_description(building_df)
Percentage of NaN in the dataset equal to 9.267553020565552
| OSEBuildingID | DataYear | BuildingType | PrimaryPropertyType | PropertyName | ZipCode | CouncilDistrictCode | Neighborhood | Latitude | Longitude | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | ListOfAllPropertyUseTypes | LargestPropertyUseType | LargestPropertyUseTypeGFA | SecondLargestPropertyUseType | SecondLargestPropertyUseTypeGFA | ThirdLargestPropertyUseType | ThirdLargestPropertyUseTypeGFA | ENERGYSTARScore | SiteEUI(kBtu/sf) | SourceEUI(kBtu/sf) | SiteEnergyUse(kBtu) | SteamUse(kBtu) | Electricity(kBtu) | NaturalGas(kBtu) | TotalGHGEmissions | GHGEmissionsIntensity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| type | int64 | int64 | object | object | object | float64 | int64 | object | float64 | float64 | int64 | float64 | float64 | int64 | int64 | int64 | object | object | float64 | object | float64 | object | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 |
| number null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 13.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 11.0 | 11.0 | 1611.0 | 1611.0 | 2614.0 | 2614.0 | 743.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| percentage null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.417738 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.032134 | 0.0 | 0.0 | 0.0 | 0.0 | 0.35347 | 0.35347 | 51.767352 | 51.767352 | 83.997429 | 83.997429 | 23.875321 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| count | 3112.0 | 3112.0 | 3112 | 3112 | 3112 | 3099.0 | 3112.0 | 3112 | 3112.0 | 3112.0 | 3112.0 | 3112.0 | 3111.0 | 3112.0 | 3112.0 | 3112.0 | 3112 | 3101 | 3101.0 | 1501 | 1501.0 | 498 | 498.0 | 2369.0 | 3112.0 | 3112.0 | 3112.0 | 3112.0 | 3112.0 | 3112.0 | 3112.0 | 3112.0 |
| unique | NaN | NaN | 8 | 24 | 3099 | NaN | NaN | 19 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 401 | 53 | NaN | 49 | NaN | 39 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| top | NaN | NaN | NonResidential | Low-Rise Multifamily | Northgate Plaza | NaN | NaN | DOWNTOWN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Multifamily Housing | Multifamily Housing | NaN | Parking | NaN | Retail Store | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| freq | NaN | NaN | 1279 | 967 | 3 | NaN | NaN | 476 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 856 | 1625 | NaN | 865 | NaN | 98 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| mean | 21667.851864 | 2016.0 | NaN | NaN | NaN | 98117.242336 | 4.390424 | NaN | 47.624493 | -122.334948 | 1967.977828 | 1.034383 | 4.124076 | 67549.784383 | 5091.654884 | 62458.129499 | NaN | NaN | 56607.588842 | NaN | 18011.70646 | NaN | 6926.24759 | 68.173913 | 48.143895 | 119.213335 | 2872234.17164 | 60307.827257 | 2006893.78926 | 802850.264794 | 61.285254 | 0.994463 |
| std | 11558.326719 | 0.0 | NaN | NaN | NaN | 18.667169 | 2.109707 | NaN | 0.048476 | 0.027534 | 33.089242 | 0.542421 | 3.329885 | 64810.956063 | 20219.872134 | 56000.109064 | NaN | NaN | 51373.874999 | NaN | 23894.681177 | NaN | 8832.247612 | 26.278591 | 36.027207 | 88.403633 | 3209253.276245 | 458798.683153 | 2467575.084181 | 1282110.397239 | 83.670567 | 1.170869 |
| min | 1.0 | 2016.0 | NaN | NaN | NaN | 98006.0 | 1.0 | NaN | 47.49917 | -122.41425 | 1900.0 | 0.0 | 0.0 | 11285.0 | 0.0 | 10925.0 | NaN | NaN | 5656.0 | NaN | 0.0 | NaN | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 25% | 20241.75 | 2016.0 | NaN | NaN | NaN | 98105.0 | 3.0 | NaN | 47.599535 | -122.351722 | 1948.0 | 1.0 | 2.0 | 27909.5 | 0.0 | 27215.75 | NaN | NaN | 24696.0 | NaN | 4752.0 | NaN | 2161.75 | 53.0 | 27.6 | 73.900002 | 903481.734375 | 0.0 | 624456.5 | 0.0 | 8.9975 | 0.2 |
| 50% | 23272.5 | 2016.0 | NaN | NaN | NaN | 98115.0 | 4.0 | NaN | 47.619415 | -122.332185 | 1974.0 | 1.0 | 4.0 | 42170.5 | 0.0 | 41173.5 | NaN | NaN | 37581.0 | NaN | 9710.0 | NaN | 4579.0 | 75.0 | 37.299999 | 94.049999 | 1666212.3125 | 0.0 | 1081909.5 | 289657.5 | 29.905 | 0.595 |
| 75% | 26020.0 | 2016.0 | NaN | NaN | NaN | 98122.0 | 7.0 | NaN | 47.658473 | -122.318938 | 1996.0 | 1.0 | 5.0 | 78380.75 | 0.0 | 72730.0 | NaN | NaN | 65689.0 | NaN | 21203.0 | NaN | 8480.25 | 89.0 | 56.624999 | 133.199997 | 3477591.25 | 0.0 | 2310250.0 | 1046129.5 | 74.33 | 1.28 |
| max | 50226.0 | 2016.0 | NaN | NaN | NaN | 98272.0 | 7.0 | NaN | 47.73387 | -122.220966 | 2015.0 | 16.0 | 42.0 | 501239.0 | 272900.0 | 370327.0 | NaN | NaN | 375502.0 | NaN | 298043.0 | NaN | 79507.0 | 100.0 | 335.5 | 781.299988 | 20723248.0 | 7815663.0 | 20361156.0 | 9190227.0 | 692.12 | 10.54 |
plt.pie(building_df.groupby(by='BuildingType').size(),
labels=building_df.groupby(by='BuildingType').size().index,
startangle=0,
shadow=True,
autopct='%1.1f%%')
plt.title('Building Type Distribution', fontsize=15, fontweight='bold')
plt.show()
condition = building_df['BuildingType'].isin([hab for hab in building_df['BuildingType'].unique() if hab.startswith('Multifamily')])
print('Multifamily buildings to be dropped equal to', building_df[condition].shape[0])
building_df = building_df[~condition]
print('After selecting the non-residential buildings, we are left with {} buildings.'
.format(building_df.shape[0]))
plt.pie(building_df.groupby(by='BuildingType').size(),
labels=building_df.groupby(by='BuildingType').size().index,
startangle=0,
shadow=True,
autopct='%1.1f%%')
plt.title('Building Type Distribution', fontsize=15, fontweight='bold')
plt.show()
Multifamily buildings to be dropped equal to 1656 After selecting the non-residential buildings, we are left with 1456 buildings.
display((building_df[['BuildingType', 'PrimaryPropertyType']])
.groupby(['BuildingType', 'PrimaryPropertyType']).size())
BuildingType PrimaryPropertyType
Campus K-12 School 4
Low-Rise Multifamily 1
Mixed Use Property 2
Other 3
University 1
NonResidential Distribution Center 49
Hospital 1
Hotel 63
K-12 School 38
Laboratory 3
Large Office 109
Low-Rise Multifamily 1
Medical Office 31
Mixed Use Property 85
Other 156
Refrigerated Warehouse 12
Residence Hall 21
Restaurant 9
Retail Store 83
Self-Storage Facility 28
Senior Care Community 14
Small- and Mid-Sized Office 282
Supermarket / Grocery Store 33
University 14
Warehouse 178
Worship Facility 69
Nonresidential COS Distribution Center 2
Large Office 1
Medical Office 1
Mixed Use Property 8
Office 3
Other 45
Small- and Mid-Sized Office 4
Warehouse 7
Nonresidential WA University 1
SPS-District K-12 K-12 School 93
Other 1
dtype: int64
We still see some multifamily classification as well residence hall in PrimaryPropertyType; We are going to drop those buildings as well.
condition = building_df['PrimaryPropertyType']\
.isin([hab for hab in building_df['PrimaryPropertyType'].unique() if ((hab.endswith('Multifamily'))
| (hab == 'Hotel')
| (hab == 'Residence Hall'))])
print('Left over multifamily buildings to be dropped equal to',
building_df[condition].shape[0])
building_df = building_df[~condition]
print('After removing those buildings, we are left with {} buildings.'
.format(building_df.shape[0]))
print(building_df.groupby(by='PrimaryPropertyType').nunique().sort_values('OSEBuildingID',
ascending=False)['OSEBuildingID'])
plt.pie(building_df.groupby(by='PrimaryPropertyType').size(),
labels=building_df.groupby(by='PrimaryPropertyType').size().index,
startangle=0,
shadow=True,
autopct='%1.1f%%')
plt.title('PrimaryPropertyType Distribution', fontsize=15, fontweight='bold')
plt.show()
Left over multifamily buildings to be dropped equal to 86 After removing those buildings, we are left with 1370 buildings. PrimaryPropertyType Small- and Mid-Sized Office 286 Other 205 Warehouse 185 K-12 School 135 Large Office 110 Mixed Use Property 95 Retail Store 83 Worship Facility 69 Distribution Center 51 Supermarket / Grocery Store 33 Medical Office 32 Self-Storage Facility 28 University 16 Senior Care Community 14 Refrigerated Warehouse 12 Restaurant 9 Office 3 Laboratory 3 Hospital 1 Name: OSEBuildingID, dtype: int64
We see low occurence of Hospital, Laboratory, Office, and Restaurant with entries of less than 10 buildings. Those can lower the performance of the ML models. We will treat them later.
building_df[['NumberofBuildings', 'NumberofFloors']].plot(kind='box', subplots=True,
figsize=(10, 3), layout=(1, 2))
plt.show()
building_df[['PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)',
'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA',
'ThirdLargestPropertyUseTypeGFA']].plot(kind='box', subplots=True,
figsize=(20, 5), layout=(1, 6))
plt.show()
columns_list = ['SiteEUI(kBtu/sf)', 'SiteEnergyUse(kBtu)', 'SteamUse(kBtu)',
'Electricity(kBtu)', 'NaturalGas(kBtu)']
building_df[columns_list].plot(kind='box', subplots=True,
figsize=(15, 5), layout=(1, 5))
plt.show()
columns_list = ['TotalGHGEmissions','GHGEmissionsIntensity']
building_df[columns_list].plot(kind='box', subplots=True,
figsize=(10, 3), layout=(1, 2))
plt.show()
We still have some outliers, we will remove again the outliers that appear at least 2 times in those columns:
columns_list = ['PropertyGFATotal', 'PropertyGFABuilding(s)',
'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA',
'SiteEUI(kBtu/sf)', 'SiteEnergyUse(kBtu)',
'Electricity(kBtu)', 'NaturalGas(kBtu)',
'TotalGHGEmissions', 'GHGEmissionsIntensity']
Outliers_Limits(building_df, columns_list, 1)
Number of outliers to remove: 22
# Limiting SiteEnergyUse to 2*1E7
condition = building_df[building_df['SiteEnergyUse(kBtu)'] > 2*1E7]
building_df.drop(condition.index, inplace=True)
print('Percentage of NaN in the dataset equal to',
sum(building_df.isna().sum()/building_df.shape[0]*100)/building_df.shape[1])
df_description(building_df)
Percentage of NaN in the dataset equal to 9.419079435783223
| OSEBuildingID | DataYear | BuildingType | PrimaryPropertyType | PropertyName | ZipCode | CouncilDistrictCode | Neighborhood | Latitude | Longitude | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | ListOfAllPropertyUseTypes | LargestPropertyUseType | LargestPropertyUseTypeGFA | SecondLargestPropertyUseType | SecondLargestPropertyUseTypeGFA | ThirdLargestPropertyUseType | ThirdLargestPropertyUseTypeGFA | ENERGYSTARScore | SiteEUI(kBtu/sf) | SourceEUI(kBtu/sf) | SiteEnergyUse(kBtu) | SteamUse(kBtu) | Electricity(kBtu) | NaturalGas(kBtu) | TotalGHGEmissions | GHGEmissionsIntensity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| type | int64 | int64 | object | object | object | float64 | int64 | object | float64 | float64 | int64 | float64 | float64 | int64 | int64 | int64 | object | object | float64 | object | float64 | object | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 |
| number null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 13.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 3.0 | 686.0 | 686.0 | 1095.0 | 1095.0 | 478.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| percentage null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.965108 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.074239 | 0.0 | 0.0 | 0.0 | 0.0 | 0.222717 | 0.222717 | 50.927988 | 50.927988 | 81.291759 | 81.291759 | 35.486266 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| count | 1347.0 | 1347.0 | 1347 | 1347 | 1347 | 1334.0 | 1347.0 | 1347 | 1347.0 | 1347.0 | 1347.0 | 1347.0 | 1346.0 | 1347.0 | 1347.0 | 1347.0 | 1347 | 1344 | 1344.0 | 661 | 661.0 | 252 | 252.0 | 869.0 | 1347.0 | 1347.0 | 1347.0 | 1347.0 | 1347.0 | 1347.0 | 1347.0 | 1347.0 |
| unique | NaN | NaN | 5 | 19 | 1344 | NaN | NaN | 19 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 286 | 53 | NaN | 44 | NaN | 34 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| top | NaN | NaN | NonResidential | Small- and Mid-Sized Office | Airport Way | NaN | NaN | GREATER DUWAMISH | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | K-12 School | Office | NaN | Parking | NaN | Parking | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| freq | NaN | NaN | 1174 | 286 | 2 | NaN | NaN | 322 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 129 | 414 | NaN | 214 | NaN | 40 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| mean | 17187.165553 | 2016.0 | NaN | NaN | NaN | 98117.488006 | 4.161841 | NaN | 47.614698 | -122.333239 | 1959.641425 | 1.023756 | 2.780832 | 65174.786192 | 6764.364514 | 58410.421678 | NaN | NaN | 54253.755208 | NaN | 16763.465048 | NaN | 7054.203572 | 65.348677 | 59.661025 | 148.66971 | 3384622.749172 | 81303.589261 | 2459305.945195 | 848471.050511 | 68.482791 | 1.246815 |
| std | 12807.524486 | 0.0 | NaN | NaN | NaN | 18.030925 | 2.182042 | NaN | 0.050516 | 0.025706 | 32.102661 | 0.447247 | 2.442856 | 59012.131437 | 23752.531251 | 47504.390947 | NaN | NaN | 47101.780124 | NaN | 21179.116206 | NaN | 7709.44189 | 28.122046 | 47.243932 | 120.931558 | 3547493.744521 | 506671.55175 | 2952703.483437 | 1229355.699077 | 81.919648 | 1.377688 |
| min | 9.0 | 2016.0 | NaN | NaN | NaN | 98006.0 | 1.0 | NaN | 47.49917 | -122.41182 | 1900.0 | 0.0 | 0.0 | 11285.0 | 0.0 | 10925.0 | NaN | NaN | 5656.0 | NaN | 0.0 | NaN | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 25% | 706.5 | 2016.0 | NaN | NaN | NaN | 98105.0 | 2.0 | NaN | 47.57887 | -122.345565 | 1929.0 | 1.0 | 1.0 | 27857.0 | 0.0 | 26835.0 | NaN | NaN | 24000.0 | NaN | 4800.0 | NaN | 2492.5 | 49.0 | 31.0 | 69.599998 | 1085615.9375 | 0.0 | 642532.5 | 0.0 | 17.06 | 0.33 |
| 50% | 21425.0 | 2016.0 | NaN | NaN | NaN | 98115.0 | 4.0 | NaN | 47.61159 | -122.33248 | 1963.0 | 1.0 | 2.0 | 43200.0 | 0.0 | 41262.0 | NaN | NaN | 38019.0 | NaN | 9616.0 | NaN | 5000.0 | 73.0 | 47.599998 | 119.0 | 2017967.75 | 0.0 | 1294767.0 | 390854.0 | 39.05 | 0.79 |
| 75% | 24629.5 | 2016.0 | NaN | NaN | NaN | 98125.0 | 7.0 | NaN | 47.651085 | -122.321155 | 1986.0 | 1.0 | 3.0 | 75548.0 | 0.0 | 67750.0 | NaN | NaN | 64769.0 | NaN | 18590.0 | NaN | 9003.75 | 88.0 | 73.25 | 185.600006 | 4339392.25 | 0.0 | 3049940.0 | 1081797.5 | 83.62 | 1.58 |
| max | 50226.0 | 2016.0 | NaN | NaN | NaN | 98199.0 | 7.0 | NaN | 47.73387 | -122.25864 | 2015.0 | 6.0 | 27.0 | 378002.0 | 272900.0 | 323916.0 | NaN | NaN | 325424.0 | NaN | 139000.0 | NaN | 58152.0 | 100.0 | 335.5 | 781.299988 | 19295226.0 | 7413886.0 | 19036413.0 | 7116213.0 | 639.72 | 10.54 |
# Let's start with resetting the index of our database
building_df = building_df.reset_index(drop=True)
# Creating a database where 'ZipCode', 'Latitude', and 'Longitude' are not null
columns_list = ['ZipCode', 'Latitude', 'Longitude']
columns_list_data = ['Latitude', 'Longitude']
columns_list_target = 'ZipCode'
data = building_df[(building_df[columns_list].notna()).all(axis=1)].reset_index(drop=True)
# Selecting 100% of that selected database
sample_index = np.random.randint(data.shape[0], size=int(data.shape[0]*1))
sample_data = data[columns_list_data].iloc[sample_index].reset_index(drop=True)
sample_target = data[columns_list_target].iloc[sample_index].reset_index(drop=True)
# Divide the data into 80% training and 20% testing
xtrain, xtest, ytrain, ytest = train_test_split(sample_data, sample_target, train_size=0.8)
# Fitting using RandomForestClassifier
classifier = RandomForestClassifier(n_jobs=-1)
classifier.fit(xtrain, ytrain)
print('Error using RandomForestClassifier equal to', (1 - classifier.score(xtest, ytest)))
Error using RandomForestClassifier equal to 0.041198501872659166
We got low error from the classifier, we will proceed to fill the missing values in zipcode:
# Creating a database where only null in 'ZipCode', and no null value on 'Latitude' and 'Longitude'
data_null = building_df[((building_df[columns_list_data].notna()).all(axis=1))
& (building_df[columns_list_target].isna())]
data_null = data_null[columns_list_data]
data_null_index = data_null.index
data_null.columns = columns_list_data
# Predicting the value of 'ZipCode' from 'Latitude' and 'Longitude' using the model
data_null[columns_list_target] = classifier.predict(data_null)
# Adding those values to the main database
building_df.loc[data_null_index,columns_list_target] = data_null[columns_list_target]
print(building_df.groupby(by='BuildingType').nunique().sort_values('OSEBuildingID',
ascending=False)['OSEBuildingID'])
print(building_df.groupby(by='PrimaryPropertyType').nunique().sort_values('OSEBuildingID',
ascending=False)['OSEBuildingID'])
print(building_df.groupby(by='LargestPropertyUseType').nunique().sort_values('OSEBuildingID',
ascending=False)['OSEBuildingID'])
BuildingType NonResidential 1174 SPS-District K-12 92 Nonresidential COS 70 Campus 10 Nonresidential WA 1 Name: OSEBuildingID, dtype: int64 PrimaryPropertyType Small- and Mid-Sized Office 286 Other 203 Warehouse 185 K-12 School 133 Large Office 99 Mixed Use Property 94 Retail Store 81 Worship Facility 69 Distribution Center 51 Supermarket / Grocery Store 32 Medical Office 29 Self-Storage Facility 28 University 15 Senior Care Community 14 Refrigerated Warehouse 12 Restaurant 9 Office 3 Laboratory 3 Hospital 1 Name: OSEBuildingID, dtype: int64 LargestPropertyUseType Office 414 Non-Refrigerated Warehouse 197 K-12 School 133 Retail Store 89 Other 80 Worship Facility 69 Distribution Center 52 Supermarket/Grocery Store 33 Medical Office 31 Self-Storage Facility 27 Other - Recreation 22 Parking 19 Other - Entertainment/Public Assembly 17 College/University 15 Senior Care Community 14 Refrigerated Warehouse 12 Social/Meeting Hall 10 Multifamily Housing 8 Restaurant 8 Manufacturing/Industrial Plant 7 Repair Services (Vehicle, Shoe, Locksmith, etc) 6 Automobile Dealership 5 Fitness Center/Health Club/Gym 5 Other - Services 5 Strip Mall 5 Other - Lodging/Residential 5 Financial Office 4 Bank Branch 4 Museum 4 Laboratory 4 Prison/Incarceration 3 Performing Arts 3 Other - Mall 3 Other/Specialty Hospital 3 Library 3 Other - Public Services 2 Urgent Care/Clinic/Other Outpatient 2 Other - Utility 2 Other - Education 2 Adult Education 2 Other - Restaurant/Bar 2 Pre-school/Daycare 2 Residence Hall/Dormitory 1 Residential Care Facility 1 Lifestyle Center 1 Police Station 1 Hotel 1 Hospital (General Medical & Surgical) 1 Food Service 1 Fire Station 1 Personal Services (Health/Beauty, Dry Cleaning, etc) 1 Wholesale Club/Supercenter 1 Movie Theater 1 Name: OSEBuildingID, dtype: int64
# Merging the low occurance of some BuildingType with other high occurance
building_df.loc[(building_df[building_df['BuildingType'] == 'Nonresidential WA'].index),
'BuildingType'] = 'NonResidential'
# Replacing the low occurance of some PrimaryPropertyType by NaN
cat_df = pd.DataFrame(building_df.groupby(by='PrimaryPropertyType').nunique().sort_values('OSEBuildingID',
ascending=False)['OSEBuildingID'])
cat_list = list(cat_df[cat_df['OSEBuildingID'] <= 5].index)
for cat in cat_list:
building_df.loc[(building_df[building_df['PrimaryPropertyType'] == cat].index),
'PrimaryPropertyType'] = np.nan
# Replacing the low occurance of some LargestPropertyUseType by NaN
cat_df = pd.DataFrame(building_df.groupby(by='LargestPropertyUseType').nunique().sort_values('OSEBuildingID',
ascending=False)['OSEBuildingID'])
cat_list = list(cat_df[cat_df['OSEBuildingID'] <= 5].index)
for cat in cat_list:
building_df.loc[(building_df[building_df['LargestPropertyUseType'] == cat].index),
'LargestPropertyUseType'] = np.nan
# building_df[building_df[['PrimaryPropertyType']].isna]
display((building_df[['PrimaryPropertyType',
'LargestPropertyUseType']].notna()).groupby(['PrimaryPropertyType',
'LargestPropertyUseType']).size())
PrimaryPropertyType LargestPropertyUseType
False False 4
True 3
True False 80
True 1260
dtype: int64
index_check = building_df[(building_df['PrimaryPropertyType'].notna()
& building_df['LargestPropertyUseType'].isna())].index
building_df.loc[index_check,
'LargestPropertyUseType'] = building_df.loc[index_check,
'PrimaryPropertyType'].values
index_check = building_df[(building_df['PrimaryPropertyType'].isna()
& building_df['LargestPropertyUseType'].notna())].index
building_df.loc[index_check,
'PrimaryPropertyType'] = building_df.loc[index_check,
'LargestPropertyUseType'].values
print(building_df.groupby(by='PrimaryPropertyType').nunique().sort_values('OSEBuildingID',
ascending=False)['OSEBuildingID'])
print(building_df.groupby(by='LargestPropertyUseType').nunique().sort_values('OSEBuildingID',
ascending=False)['OSEBuildingID'])
PrimaryPropertyType Small- and Mid-Sized Office 286 Other 203 Warehouse 185 K-12 School 133 Large Office 99 Mixed Use Property 94 Retail Store 81 Worship Facility 69 Distribution Center 51 Supermarket / Grocery Store 32 Medical Office 29 Self-Storage Facility 28 University 15 Senior Care Community 14 Refrigerated Warehouse 12 Restaurant 9 Office 3 Name: OSEBuildingID, dtype: int64 LargestPropertyUseType Office 414 Non-Refrigerated Warehouse 197 Other 145 K-12 School 133 Retail Store 89 Worship Facility 69 Distribution Center 52 Supermarket/Grocery Store 33 Medical Office 31 Self-Storage Facility 28 Other - Recreation 22 Parking 19 Other - Entertainment/Public Assembly 17 College/University 15 Senior Care Community 14 Refrigerated Warehouse 12 Restaurant 10 Mixed Use Property 10 Social/Meeting Hall 10 Multifamily Housing 8 Manufacturing/Industrial Plant 7 Repair Services (Vehicle, Shoe, Locksmith, etc) 6 Small- and Mid-Sized Office 1 Supermarket / Grocery Store 1 Name: OSEBuildingID, dtype: int64
# Merging the low occurance of some BuildingType with other high occurance
building_df.loc[(building_df[building_df['PrimaryPropertyType'] == 'Office'].index),
'PrimaryPropertyType'] = 'Small- and Mid-Sized Office'
building_df.loc[(building_df[building_df['LargestPropertyUseType'] == 'Supermarket / Grocery Store'].index),
'LargestPropertyUseType'] = 'Supermarket/Grocery Store'
building_df.loc[(building_df[building_df['LargestPropertyUseType'] == 'Small- and Mid-Sized Office'].index),
'LargestPropertyUseType'] = 'Office'
building_df.loc[(building_df[building_df[['PrimaryPropertyType',
'LargestPropertyUseType']].isna().any(axis=1)].index),
['PropertyName', 'BuildingType', 'ListOfAllPropertyUseTypes',
'PrimaryPropertyType', 'LargestPropertyUseType', 'SecondLargestPropertyUseType']]
| PropertyName | BuildingType | ListOfAllPropertyUseTypes | PrimaryPropertyType | LargestPropertyUseType | SecondLargestPropertyUseType | |
|---|---|---|---|---|---|---|
| 190 | Rosen Building - SEDO | NonResidential | Data Center, Laboratory | NaN | NaN | Data Center |
| 456 | 1100 Olive Lab | NonResidential | Laboratory | NaN | NaN | NaN |
| 1116 | Park West Skilled Nursing | NonResidential | Hospital (General Medical & Surgical) | NaN | NaN | NaN |
| 1268 | Environmental Laboratory | NonResidential | Laboratory, Other | NaN | NaN | Other |
Since they belong to Laboratory and Hospital as PrimaryPropertyType and LargestPropertyUseType, and because these two categories are not represented in the database, we will drop these 4 buildings.
condition = building_df[building_df[['PrimaryPropertyType', 'LargestPropertyUseType']].isna().any(axis=1)]
building_df.drop(condition.index, inplace=True)
building_df['NumberofBuildings'].replace(0, np.nan, inplace=True)
building_df['NumberofFloors'].replace(0, np.nan, inplace=True)
building_df['NumberofBuildings'].fillna(building_df.groupby('PrimaryPropertyType')['NumberofBuildings'].transform('median').round(0), inplace=True)
building_df['NumberofFloors'].fillna(building_df.groupby('PrimaryPropertyType')['NumberofFloors'].transform('median').round(0), inplace=True)
print((building_df['PropertyGFATotal']/(building_df['PropertyGFAParking']+building_df['PropertyGFABuilding(s)'])).describe())
plt.figure(figsize=(3, 3))
plt.plot(building_df['PropertyGFAParking'], building_df['PropertyGFABuilding(s)'], 'o')
plt.xlabel('PropertyGFAParking')
plt.ylabel('PropertyGFABuilding(s)')
plt.show()
count 1343.0 mean 1.0 std 0.0 min 1.0 25% 1.0 50% 1.0 75% 1.0 max 1.0 dtype: float64
PropertyGFATotal equal to PropertyGFAParking+PropertyGFABuilding(s).
building_df[building_df['LargestPropertyUseTypeGFA'].isna()]
| OSEBuildingID | DataYear | BuildingType | PrimaryPropertyType | PropertyName | ZipCode | CouncilDistrictCode | Neighborhood | Latitude | Longitude | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | ListOfAllPropertyUseTypes | LargestPropertyUseType | LargestPropertyUseTypeGFA | SecondLargestPropertyUseType | SecondLargestPropertyUseTypeGFA | ThirdLargestPropertyUseType | ThirdLargestPropertyUseTypeGFA | ENERGYSTARScore | SiteEUI(kBtu/sf) | SourceEUI(kBtu/sf) | SiteEnergyUse(kBtu) | SteamUse(kBtu) | Electricity(kBtu) | NaturalGas(kBtu) | TotalGHGEmissions | GHGEmissionsIntensity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 204 | 496 | 2016 | NonResidential | Self-Storage Facility | Market St Center | 98107.0 | 6 | BALLARD | 47.66838 | -122.39310 | 1946 | 2.0 | 2.0 | 111445 | 0 | 111445 | Fitness Center/Health Club/Gym, Office, Other ... | Self-Storage Facility | NaN | NaN | NaN | NaN | NaN | NaN | 47.799999 | 103.000000 | 5.697472e+06 | 0.0 | 3007514.0 | 2689958.0 | 163.83 | 1.47 |
| 1094 | 25568 | 2016 | NonResidential | Small- and Mid-Sized Office | Talon Northlake LLC | 98103.0 | 4 | LAKE UNION | 47.64747 | -122.34086 | 2008 | 1.0 | 4.0 | 48350 | 0 | 48350 | Office | Office | NaN | NaN | NaN | NaN | NaN | 45.0 | 72.199997 | 226.600006 | 3.168131e+06 | 0.0 | 3168131.0 | 0.0 | 22.09 | 0.46 |
| 1111 | 25711 | 2016 | NonResidential | Restaurant | BUSH GARDEN - RESTURANT & LOUNGE | 98104.0 | 2 | DOWNTOWN | 47.59697 | -122.32474 | 1913 | 1.0 | 3.0 | 28800 | 0 | 28800 | Restaurant | Restaurant | NaN | NaN | NaN | NaN | NaN | NaN | 31.200001 | 62.000000 | 8.999242e+05 | 0.0 | 402907.0 | 497017.0 | 29.21 | 1.01 |
Dropping building ID 496 and filling the other two with the value in PropertyGFATotal
building_df.drop(building_df[building_df['OSEBuildingID'] == 496].index, inplace=True)
building_df.loc[building_df[building_df['LargestPropertyUseTypeGFA'].isna()].index,
'LargestPropertyUseTypeGFA'] = building_df.loc[building_df[building_df['LargestPropertyUseTypeGFA'].isna()].index,
'PropertyGFATotal']
display((building_df[['SecondLargestPropertyUseType',
'SecondLargestPropertyUseTypeGFA']].notna()).groupby(['SecondLargestPropertyUseType',
'SecondLargestPropertyUseTypeGFA']).size())
display((building_df[['ThirdLargestPropertyUseType',
'ThirdLargestPropertyUseTypeGFA']].notna()).groupby(['ThirdLargestPropertyUseType',
'ThirdLargestPropertyUseTypeGFA']).size())
display((building_df[['SecondLargestPropertyUseTypeGFA',
'ThirdLargestPropertyUseTypeGFA']].notna()).groupby(['SecondLargestPropertyUseTypeGFA',
'ThirdLargestPropertyUseTypeGFA']).size())
SecondLargestPropertyUseType SecondLargestPropertyUseTypeGFA False False 683 True True 659 dtype: int64
ThirdLargestPropertyUseType ThirdLargestPropertyUseTypeGFA False False 1090 True True 252 dtype: int64
SecondLargestPropertyUseTypeGFA ThirdLargestPropertyUseTypeGFA
False False 683
True False 407
True 252
dtype: int64
There is either a value in both or missing values in both. We will proceed to filling the missing values.
building_index = building_df[(building_df['SecondLargestPropertyUseTypeGFA'] == 0)
& (building_df['SecondLargestPropertyUseType'].notna())].index
building_df.loc[building_index, ('SecondLargestPropertyUseType', 'SecondLargestPropertyUseTypeGFA')] = np.nan
print('Number of building treated for having SecondLargestPropertyUseTypeGFA equal 0:', building_index.shape[0])
building_index = building_df[(building_df['ThirdLargestPropertyUseTypeGFA'] == 0)
& (building_df['ThirdLargestPropertyUseType'].notna())].index
building_df.loc[building_index, ('ThirdLargestPropertyUseType', 'ThirdLargestPropertyUseTypeGFA')] = np.nan
print('Number of building treated for having ThirdLargestPropertyUseTypeGFA equal 0:', building_index.shape[0])
Number of building treated for having SecondLargestPropertyUseTypeGFA equal 0: 65 Number of building treated for having ThirdLargestPropertyUseTypeGFA equal 0: 16
building_index = building_df[(building_df['PropertyGFATotal']
< building_df['LargestPropertyUseTypeGFA'])].index
print('Number of building having LargestPropertyUseTypeGFA > PropertyGFATotal:',
building_index.shape[0])
building_df.loc[building_index, 'LargestPropertyUseTypeGFA'] = building_df.loc[building_index,
'PropertyGFATotal']
Number of building having LargestPropertyUseTypeGFA > PropertyGFATotal: 167
print((building_df['SiteEnergyUse(kBtu)'] / (building_df['Electricity(kBtu)']
+ building_df['SteamUse(kBtu)']
+ building_df['NaturalGas(kBtu)'])).describe())
count 1341.0 mean inf std NaN min 0.0 25% 1.0 50% 1.0 75% 1.0 max inf dtype: float64
- We see that the minimum value of the factor equal to 0 which means that SiteEnergyUse is 0 and at least one of electricity, natural gas, and steam is not 0.
- In that case when SiteEnergyUse(kBtu) is smaller than the sum, we will replace the value of SiteEnergyUse(kBtu) with the sum of electricity, natural gas, and steam.
- We see the maximum value of the factor going to infinity which means that electricity, natural gas, and steam all equal to 0.
- We will check the buildings where all SiteEnergyUse(kBtu), electricity, natural gas, and steam all equal to 0 and remove them from the dataset.
- For the other buildings where SiteEnergyUse(kBtu) is bigger than the sum of electricity, natural gas, and steam, this can be due to the presence of another source of consumed energy. We will create another column named OtherFuel(kBtu) with the value equal to SiteEnergyUse(kBtu)-(electricity+natural gas+steam)
building_index = building_df[(building_df['SiteEnergyUse(kBtu)']
< (building_df['Electricity(kBtu)']
+ building_df['SteamUse(kBtu)']
+ building_df['NaturalGas(kBtu)']))].index
for i in building_index:
building_df.loc[i, 'SiteEnergyUse(kBtu)'] = building_df.loc[i, ['Electricity(kBtu)',
'SteamUse(kBtu)',
'NaturalGas(kBtu)']].sum()
print('Total number of buildings to drop where all energy equal to 0 equal to:',
building_df[(building_df[['SiteEnergyUse(kBtu)', 'Electricity(kBtu)',
'SteamUse(kBtu)', 'NaturalGas(kBtu)']].any(axis=1)) == 0].shape[0])
building_df.drop(building_df[(building_df[['SiteEnergyUse(kBtu)', 'Electricity(kBtu)',
'SteamUse(kBtu)', 'NaturalGas(kBtu)']].any(axis=1))
== 0].index, inplace=True)
Total number of buildings to drop where all energy equal to 0 equal to: 1
# Creation of new column OtherFuel(kBtu)
indexing = building_df.index
building_df['OtherFuel(kBtu)'] = np.nan
for i in building_df.index:
building_df.loc[i, 'OtherFuel(kBtu)'] = (building_df.loc[i, 'SiteEnergyUse(kBtu)']
- building_df.loc[i, ['Electricity(kBtu)',
'SteamUse(kBtu)',
'NaturalGas(kBtu)']].sum())
print('Percentage of NaN in the dataset equal to',
sum(building_df.isna().sum()/building_df.shape[0]*100)/building_df.shape[1])
building_df = building_df.reset_index(drop=True)
df_description(building_df)
Percentage of NaN in the dataset equal to 9.441167830429578
| OSEBuildingID | DataYear | BuildingType | PrimaryPropertyType | PropertyName | ZipCode | CouncilDistrictCode | Neighborhood | Latitude | Longitude | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | ListOfAllPropertyUseTypes | LargestPropertyUseType | LargestPropertyUseTypeGFA | SecondLargestPropertyUseType | SecondLargestPropertyUseTypeGFA | ThirdLargestPropertyUseType | ThirdLargestPropertyUseTypeGFA | ENERGYSTARScore | SiteEUI(kBtu/sf) | SourceEUI(kBtu/sf) | SiteEnergyUse(kBtu) | SteamUse(kBtu) | Electricity(kBtu) | NaturalGas(kBtu) | TotalGHGEmissions | GHGEmissionsIntensity | OtherFuel(kBtu) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| type | int64 | int64 | object | object | object | float64 | int64 | object | float64 | float64 | int64 | float64 | float64 | int64 | int64 | int64 | object | object | float64 | object | float64 | object | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 |
| number null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 747.0 | 747.0 | 1105.0 | 1105.0 | 474.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| percentage null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 55.704698 | 55.704698 | 82.401193 | 82.401193 | 35.346756 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| count | 1341.0 | 1341.0 | 1341 | 1341 | 1341 | 1341.0 | 1341.0 | 1341 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341 | 1341 | 1341.0 | 594 | 594.0 | 236 | 236.0 | 867.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 |
| unique | NaN | NaN | 4 | 16 | 1338 | NaN | NaN | 19 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 281 | 22 | NaN | 43 | NaN | 33 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| top | NaN | NaN | NonResidential | Small- and Mid-Sized Office | Airport Way | NaN | NaN | GREATER DUWAMISH | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Office | Office | NaN | Parking | NaN | Office | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| freq | NaN | NaN | 1170 | 289 | 2 | NaN | NaN | 322 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 129 | 415 | NaN | 152 | NaN | 39 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| mean | 17192.407905 | 2016.0 | NaN | NaN | NaN | 98117.511559 | 4.155854 | NaN | 47.614561 | -122.333151 | 1959.66965 | 1.054437 | 2.788963 | 65204.343028 | 6778.224459 | 58426.118568 | NaN | NaN | 52921.695004 | NaN | 18631.030971 | NaN | 7532.454661 | 65.268743 | 59.480686 | 148.230649 | 3392866.183954 | 81667.36371 | 2451620.760013 | 844129.878477 | 68.226734 | 1.242177 | 15448.181754 |
| std | 12813.579527 | 0.0 | NaN | NaN | NaN | 18.003629 | 2.180378 | NaN | 0.050503 | 0.025648 | 32.150656 | 0.407967 | 2.443278 | 59120.21036 | 23798.419674 | 47574.437729 | NaN | NaN | 45749.688583 | NaN | 21551.44345 | NaN | 7737.210878 | 28.105098 | 46.91207 | 120.258648 | 3531540.012045 | 507775.349524 | 2951370.957907 | 1223941.285058 | 81.680099 | 1.370458 | 358531.100048 |
| min | 9.0 | 2016.0 | NaN | NaN | NaN | 98006.0 | 1.0 | NaN | 47.49917 | -122.41182 | 1900.0 | 1.0 | 1.0 | 11285.0 | 0.0 | 10925.0 | NaN | NaN | 5656.0 | NaN | 200.0 | NaN | 182.0 | 1.0 | 0.0 | 0.0 | 57133.19922 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 25% | 707.0 | 2016.0 | NaN | NaN | NaN | 98105.0 | 2.0 | NaN | 47.57842 | -122.34539 | 1929.0 | 1.0 | 1.0 | 27800.0 | 0.0 | 26830.0 | NaN | NaN | 23751.0 | NaN | 6485.25 | NaN | 2968.75 | 49.0 | 31.0 | 69.699997 | 1108311.0 | 0.0 | 642420.0 | 0.0 | 17.08 | 0.33 | 0.0 |
| 50% | 21425.0 | 2016.0 | NaN | NaN | NaN | 98115.0 | 4.0 | NaN | 47.61118 | -122.33248 | 1963.0 | 1.0 | 2.0 | 43183.0 | 0.0 | 41221.0 | NaN | NaN | 36444.0 | NaN | 10766.0 | NaN | 5348.5 | 73.0 | 47.5 | 119.0 | 2042771.0 | 0.0 | 1293349.0 | 390854.0 | 39.05 | 0.79 | 0.25 |
| 75% | 24629.0 | 2016.0 | NaN | NaN | NaN | 98125.0 | 7.0 | NaN | 47.65097 | -122.32111 | 1986.0 | 1.0 | 3.0 | 75600.0 | 0.0 | 67755.0 | NaN | NaN | 63000.0 | NaN | 20938.5 | NaN | 9590.75 | 88.0 | 73.199997 | 185.600006 | 4323672.0 | 0.0 | 3020001.0 | 1073829.0 | 83.45 | 1.58 | 0.5 |
| max | 50226.0 | 2016.0 | NaN | NaN | NaN | 98199.0 | 7.0 | NaN | 47.73387 | -122.25864 | 2015.0 | 6.0 | 27.0 | 378002.0 | 272900.0 | 323916.0 | NaN | NaN | 289588.0 | NaN | 139000.0 | NaN | 58152.0 | 100.0 | 335.5 | 781.299988 | 19295226.0 | 7413886.0 | 19036413.0 | 7116213.0 | 639.72 | 10.54 | 12525174.0 |
There is missing values in these columns: SecondLargestPropertyUseType, SecondLargestPropertyUseTypeGFA, ThirdLargestPropertyUseType, ThirdLargestPropertyUseTypeGFA, and ENERGYSTARScore. We will not treat them any further. The SecondLargestPropertyUseType, SecondLargestPropertyUseTypeGFA, ThirdLargestPropertyUseType, ThirdLargestPropertyUseTypeGFA will be replaced by another columns as we will see in the feature engineering. For ENERGYSTARScore, we will leave it as it is.
building_df['BuildingAge'] = building_df['DataYear'] - building_df['YearBuilt']
building_df['BuildingDecade'] = building_df['YearBuilt']//10
building_df = building_df.drop(['DataYear', 'YearBuilt'], axis=1)
AllPropertyUseType_list = building_df[['LargestPropertyUseType',
'SecondLargestPropertyUseType',
'ThirdLargestPropertyUseType']].values
# List of all unique type in the three PropertyUseType columns
AllPropertyUseType_list = list(set(AllPropertyUseType_list.reshape((-1,))))
AllPropertyUseType_list = sorted([x for x in AllPropertyUseType_list if pd.notnull(x)])
# Creating another list of type in LargestPropertyUseType
LargestPropertyUseType_list = building_df['LargestPropertyUseType'].values
# List of all unique type in LargestPropertyUseType
LargestPropertyUseType_list = list(set(LargestPropertyUseType_list.reshape((-1,))))
LargestPropertyUseType_list = sorted([x for x in LargestPropertyUseType_list if pd.notnull(x)])
# Creating the GFA columns with value equal to 0
for column in AllPropertyUseType_list:
building_df[column+'_GFAratio'] = 0
# Filling the columns with the GFA information in LargestGFA, SecondLargestGFA, and ThirdLargestGFA:
for i in range(0, len(building_df)) :
if pd.notnull(building_df['LargestPropertyUseTypeGFA'].iloc[i]):
building_df.loc[i,building_df['LargestPropertyUseType'].iloc[i]+'_GFAratio'] += building_df['LargestPropertyUseTypeGFA'].iloc[i]/building_df['PropertyGFATotal'].iloc[i]
if pd.notnull(building_df['SecondLargestPropertyUseTypeGFA'].iloc[i]):
building_df.loc[i,building_df['SecondLargestPropertyUseType'].iloc[i]+'_GFAratio'] += building_df['SecondLargestPropertyUseTypeGFA'].iloc[i]/building_df['PropertyGFATotal'].iloc[i]
if pd.notnull(building_df['ThirdLargestPropertyUseTypeGFA'].iloc[i]):
building_df.loc[i,building_df['ThirdLargestPropertyUseType'].iloc[i]+'_GFAratio'] += building_df['ThirdLargestPropertyUseTypeGFA'].iloc[i]/building_df['PropertyGFATotal'].iloc[i]
# Removing all created columns with less than 10 entries different than 0
for column in AllPropertyUseType_list:
if ((building_df[building_df[column+'_GFAratio'] != 0].shape[0] < 10)
& (column not in LargestPropertyUseType_list)):
building_df = building_df.drop([column+'_GFAratio'], axis='columns')
building_df.shape
(1341, 59)
building_df = building_df.reset_index(drop=True)
building_df['PropertyUseTypeCount'] = 0
for i in range(0, len(building_df)) :
building_df.loc[i, 'PropertyUseTypeCount'] = ((building_df.iloc[i][['LargestPropertyUseTypeGFA',
'SecondLargestPropertyUseTypeGFA',
'ThirdLargestPropertyUseTypeGFA']] != 0)
& (building_df.iloc[i][['LargestPropertyUseTypeGFA',
'SecondLargestPropertyUseTypeGFA',
'ThirdLargestPropertyUseTypeGFA']].notna())).sum()
building_df['MainEnergySource'] = building_df[['Electricity(kBtu)',
'SteamUse(kBtu)',
'NaturalGas(kBtu)']].idxmax(axis=1)
print(building_df.groupby(by='MainEnergySource').nunique().sort_values('OSEBuildingID',
ascending=False)['OSEBuildingID'])
MainEnergySource Electricity(kBtu) 998 NaturalGas(kBtu) 330 SteamUse(kBtu) 13 Name: OSEBuildingID, dtype: int64
building_df = building_df.reset_index(drop=True)
building_df['EnergySourceCount'] = 0
for i in range(0, len(building_df)) :
building_df.loc[i, 'EnergySourceCount'] = (building_df.iloc[i][['Electricity(kBtu)',
'SteamUse(kBtu)',
'NaturalGas(kBtu)']] != 0).sum()
building_df = building_df.reset_index(drop=True)
df_description(building_df)
| OSEBuildingID | BuildingType | PrimaryPropertyType | PropertyName | ZipCode | CouncilDistrictCode | Neighborhood | Latitude | Longitude | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | ListOfAllPropertyUseTypes | LargestPropertyUseType | LargestPropertyUseTypeGFA | SecondLargestPropertyUseType | SecondLargestPropertyUseTypeGFA | ThirdLargestPropertyUseType | ThirdLargestPropertyUseTypeGFA | ENERGYSTARScore | SiteEUI(kBtu/sf) | SourceEUI(kBtu/sf) | SiteEnergyUse(kBtu) | SteamUse(kBtu) | Electricity(kBtu) | NaturalGas(kBtu) | TotalGHGEmissions | GHGEmissionsIntensity | OtherFuel(kBtu) | BuildingAge | BuildingDecade | Bank Branch_GFAratio | College/University_GFAratio | Data Center_GFAratio | Distribution Center_GFAratio | Fitness Center/Health Club/Gym_GFAratio | K-12 School_GFAratio | Manufacturing/Industrial Plant_GFAratio | Medical Office_GFAratio | Mixed Use Property_GFAratio | Multifamily Housing_GFAratio | Non-Refrigerated Warehouse_GFAratio | Office_GFAratio | Other_GFAratio | Other - Entertainment/Public Assembly_GFAratio | Other - Recreation_GFAratio | Other - Services_GFAratio | Parking_GFAratio | Refrigerated Warehouse_GFAratio | Repair Services (Vehicle, Shoe, Locksmith, etc)_GFAratio | Restaurant_GFAratio | Retail Store_GFAratio | Self-Storage Facility_GFAratio | Senior Care Community_GFAratio | Social/Meeting Hall_GFAratio | Supermarket/Grocery Store_GFAratio | Worship Facility_GFAratio | PropertyUseTypeCount | MainEnergySource | EnergySourceCount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| type | int64 | object | object | object | float64 | int64 | object | float64 | float64 | float64 | float64 | int64 | int64 | int64 | object | object | float64 | object | float64 | object | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | int64 | int64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | int64 | object | int64 |
| number null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 747.0 | 747.0 | 1105.0 | 1105.0 | 474.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| percentage null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 55.704698 | 55.704698 | 82.401193 | 82.401193 | 35.346756 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| count | 1341.0 | 1341 | 1341 | 1341 | 1341.0 | 1341.0 | 1341 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341 | 1341 | 1341.0 | 594 | 594.0 | 236 | 236.0 | 867.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341 | 1341.0 |
| unique | NaN | 4 | 16 | 1338 | NaN | NaN | 19 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 281 | 22 | NaN | 43 | NaN | 33 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3 | NaN |
| top | NaN | NonResidential | Small- and Mid-Sized Office | Airport Way | NaN | NaN | GREATER DUWAMISH | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Office | Office | NaN | Parking | NaN | Office | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Electricity(kBtu) | NaN |
| freq | NaN | 1170 | 289 | 2 | NaN | NaN | 322 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 129 | 415 | NaN | 152 | NaN | 39 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 998 | NaN |
| mean | 17192.407905 | NaN | NaN | NaN | 98117.511559 | 4.155854 | NaN | 47.614561 | -122.333151 | 1.054437 | 2.788963 | 65204.343028 | 6778.224459 | 58426.118568 | NaN | NaN | 52921.695004 | NaN | 18631.030971 | NaN | 7532.454661 | 65.268743 | 59.480686 | 148.230649 | 3392866.183954 | 81667.36371 | 2451620.760013 | 844129.878477 | 68.226734 | 1.242177 | 15448.181754 | 56.33035 | 195.536167 | 0.001136 | 0.010766 | 0.000603 | 0.036152 | 0.003098 | 0.097992 | 0.005331 | 0.019926 | 0.002829 | 0.006166 | 0.134384 | 0.274615 | 0.101099 | 0.01212 | 0.01327 | 0.002146 | 0.043729 | 0.008388 | 0.00492 | 0.010531 | 0.073846 | 0.019778 | 0.008679 | 0.006267 | 0.023417 | 0.049998 | 1.618941 | NaN | 1.741238 |
| std | 12813.579527 | NaN | NaN | NaN | 18.003629 | 2.180378 | NaN | 0.050503 | 0.025648 | 0.407967 | 2.443278 | 59120.21036 | 23798.419674 | 47574.437729 | NaN | NaN | 45749.688583 | NaN | 21551.44345 | NaN | 7737.210878 | 28.105098 | 46.91207 | 120.258648 | 3531540.012045 | 507775.349524 | 2951370.957907 | 1223941.285058 | 81.680099 | 1.370458 | 358531.100048 | 32.150656 | 3.188072 | 0.014974 | 0.10179 | 0.009205 | 0.174062 | 0.033733 | 0.294543 | 0.068781 | 0.124538 | 0.033834 | 0.046094 | 0.314654 | 0.374637 | 0.276869 | 0.099387 | 0.102131 | 0.025985 | 0.12683 | 0.086722 | 0.063918 | 0.079024 | 0.224152 | 0.135193 | 0.088001 | 0.068601 | 0.141105 | 0.213572 | 0.766986 | NaN | 0.486542 |
| min | 9.0 | NaN | NaN | NaN | 98006.0 | 1.0 | NaN | 47.49917 | -122.41182 | 1.0 | 1.0 | 11285.0 | 0.0 | 10925.0 | NaN | NaN | 5656.0 | NaN | 200.0 | NaN | 182.0 | 1.0 | 0.0 | 0.0 | 57133.19922 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 190.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | NaN | 0.0 |
| 25% | 707.0 | NaN | NaN | NaN | 98105.0 | 2.0 | NaN | 47.57842 | -122.34539 | 1.0 | 1.0 | 27800.0 | 0.0 | 26830.0 | NaN | NaN | 23751.0 | NaN | 6485.25 | NaN | 2968.75 | 49.0 | 31.0 | 69.699997 | 1108311.0 | 0.0 | 642420.0 | 0.0 | 17.08 | 0.33 | 0.0 | 30.0 | 192.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | NaN | 1.0 |
| 50% | 21425.0 | NaN | NaN | NaN | 98115.0 | 4.0 | NaN | 47.61118 | -122.33248 | 1.0 | 2.0 | 43183.0 | 0.0 | 41221.0 | NaN | NaN | 36444.0 | NaN | 10766.0 | NaN | 5348.5 | 73.0 | 47.5 | 119.0 | 2042771.0 | 0.0 | 1293349.0 | 390854.0 | 39.05 | 0.79 | 0.25 | 53.0 | 196.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | NaN | 2.0 |
| 75% | 24629.0 | NaN | NaN | NaN | 98125.0 | 7.0 | NaN | 47.65097 | -122.32111 | 1.0 | 3.0 | 75600.0 | 0.0 | 67755.0 | NaN | NaN | 63000.0 | NaN | 20938.5 | NaN | 9590.75 | 88.0 | 73.199997 | 185.600006 | 4323672.0 | 0.0 | 3020001.0 | 1073829.0 | 83.45 | 1.58 | 0.5 | 87.0 | 198.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.587782 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | NaN | 2.0 |
| max | 50226.0 | NaN | NaN | NaN | 98199.0 | 7.0 | NaN | 47.73387 | -122.25864 | 6.0 | 27.0 | 378002.0 | 272900.0 | 323916.0 | NaN | NaN | 289588.0 | NaN | 139000.0 | NaN | 58152.0 | 100.0 | 335.5 | 781.299988 | 19295226.0 | 7413886.0 | 19036413.0 | 7116213.0 | 639.72 | 10.54 | 12525174.0 | 116.0 | 201.0 | 0.270421 | 1.0 | 0.266239 | 1.0 | 0.461031 | 1.0 | 1.0 | 1.0 | 0.548901 | 0.528618 | 1.0 | 2.452054 | 1.0 | 1.0 | 1.0 | 0.495667 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 3.0 | NaN | 3.0 |
sns.histplot(data=building_df[(building_df['SiteEnergyUse(kBtu)'] < 2e7)],
x='SiteEnergyUse(kBtu)', kde=True, bins=100)
plt.show()
sns.histplot(data=building_df[(building_df['TotalGHGEmissions'] < 200)],
x='TotalGHGEmissions', kde=True, bins=100)
plt.xlabel('TotalGHGEmissions(tCO2e)')
plt.show()
We see that both targets does not follow a normal distribution which can a problem for the linear regression models. We will test with and without log transformation of the target to check the error and score of the models.
plt.plot(building_df['TotalGHGEmissions'], building_df['SiteEnergyUse(kBtu)'], 'o')
plt.ylabel('SiteEnergyUse(kBtu)')
plt.xlabel('TotalGHGEmissions(tCO2e)')
plt.show()
We see correlation between the two targets which is linked to the energy sources (electricity, steam and naturalGas).
columns_list = ['Electricity(kBtu)','SteamUse(kBtu)','NaturalGas(kBtu)']
n_sub = len(columns_list)
n_col = 3
fig, axes = plt.subplots(int(round((0.49 + n_sub/n_col), 0)),
n_col, figsize=(20, int(1*n_sub)))
axes = np.array(axes)
i = 0
for ax in axes.reshape(-1):
sns.histplot(data=building_df, x=columns_list[i], ax=ax)
i += 1
plt.show()
plt.pie(building_df[['SteamUse(kBtu)', 'Electricity(kBtu)',
'NaturalGas(kBtu)']].mean(),
labels=building_df[['SteamUse(kBtu)', 'Electricity(kBtu)',
'NaturalGas(kBtu)']].mean().index,
startangle=0,
shadow=True,
autopct='%1.1f%%')
plt.title('Mean of Energy Source Consumption for All Buildings',
fontsize=12, fontweight='bold')
plt.show()
Electricity, NaruralGas and Steam show skewed distribution. We see also dominance of Electricity over NaruralGas and Steam.
columns_list = ['SiteEnergyUse(kBtu)', 'Electricity(kBtu)',
'SteamUse(kBtu)', 'NaturalGas(kBtu)']
n_sub = len(columns_list)
n_col = 4
fig, axes = plt.subplots(int(round((0.49 + n_sub/n_col), 0)),
n_col, figsize=(20, int(1*n_sub)))
axes = np.array(axes)
i = 0
for ax in axes.reshape(-1):
ax.plot(building_df['TotalGHGEmissions'],
building_df[columns_list[i]], 'o')
ax.set_xlabel('TotalGHGEmissions(tCO2e)')
ax.set_ylabel(columns_list[i])
i += 1
plt.show()
Looking at the plots, we see different correlation values for each source of energy with TotalGHGEmissions as expected.
print((building_df['SiteEUI(kBtu/sf)'] / building_df['SourceEUI(kBtu/sf)']).describe())
count 1330.000000 mean inf std NaN min 0.072889 25% 0.318780 50% 0.389706 75% 0.479677 max inf dtype: float64
Not so efficient, Seattle loose on average 60% of the energy due to transmitting from source to site.
building_df = building_df.drop(['SourceEUI(kBtu/sf)'], axis=1)
colormap = cm.LinearColormap(colors=['green', 'orange', 'red'], vmin=0,
vmax=building_df['SiteEnergyUse(kBtu)'].median()*5)
Seattle_Map = folium.Map(location=[building_df['Latitude'].mean(),
building_df['Longitude'].mean()], zoom_start=11)
lat = list(building_df['Latitude'])
lon = list(building_df['Longitude'])
color_pow = list(building_df['SiteEnergyUse(kBtu)'])
for loc, p in zip(zip(building_df.Latitude, building_df.Longitude), color_pow):
folium.Circle(location=loc, radius=5, color=colormap(p)).add_to(Seattle_Map)
Seattle_Map.add_child(colormap)
Seattle_Map
We see no effect of location on the energy usage of the building.
colormap = cm.LinearColormap(colors=['green', 'orange', 'red'], vmin=0,
vmax=building_df['TotalGHGEmissions'].median()*5)
Seattle_Map = folium.Map(location=[building_df['Latitude'].mean(),
building_df['Longitude'].mean()], zoom_start=11)
lat = list(building_df['Latitude'])
lon = list(building_df['Longitude'])
color_pow = list(building_df['TotalGHGEmissions'])
for loc, p in zip(zip(building_df.Latitude, building_df.Longitude), color_pow):
folium.Circle(location=loc, radius=5, color=colormap(p)).add_to(Seattle_Map)
Seattle_Map.add_child(colormap)
Seattle_Map
Similar to SiteEnergyUse, we see no effect of the building location on the GHG emission.
building_df = building_df.drop(['Latitude', 'Longitude'], axis=1)
columns_list = ['PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)',
'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA',
'ThirdLargestPropertyUseTypeGFA']
n_sub = len(columns_list)
n_col = 3
fig, axes = plt.subplots(int(round((0.49 + n_sub/n_col), 0)),
n_col, figsize=(20, int(1.5*n_sub)))
axes = np.array(axes)
i = 0
for ax in axes.reshape(-1):
sns.histplot(data=building_df, x=columns_list[i], ax=ax)
i += 1
plt.show()
We see that all distributions are skewed, we will use PowerTransformer as an option to log scale the distribution before the ML prediction.
columns_list = ['PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)',
'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA',
'ThirdLargestPropertyUseTypeGFA']
n_sub = len(columns_list)
n_col = 3
fig, axes = plt.subplots(int(round((0.49 + n_sub/n_col), 0)),
n_col, figsize=(20, int(2*n_sub)))
axes = np.array(axes)
i = 0
for ax in axes.reshape(-1):
ax.plot(building_df['SiteEnergyUse(kBtu)'],
building_df[columns_list[i]], 'o')
ax.set_xlabel('SiteEnergyUse(kBtu)')
ax.set_ylabel(columns_list[i])
i += 1
plt.show()
We see correlation between GFA columns with SiteEnergyUse. The bigger the GFA of the building the larger the energy consumption.
columns_list = ['PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)',
'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA',
'ThirdLargestPropertyUseTypeGFA']
n_sub = len(columns_list)
n_col = 3
fig, axes = plt.subplots(int(round((0.49 + n_sub/n_col), 0)),
n_col, figsize=(20, int(2*n_sub)))
axes = np.array(axes)
i = 0
for ax in axes.reshape(-1):
ax.plot(building_df['TotalGHGEmissions'],
building_df[columns_list[i]], 'o')
ax.set_xlabel('TotalGHGEmissions')
ax.set_ylabel(columns_list[i])
i += 1
plt.show()
We see correlation between GFA columns with TotalGHGEmissions as well. The bigger the GFA of the building the higher the GHG emissions.
plt.figure(figsize=(8, 2))
sns.histplot(data=building_df.sort_values('BuildingAge'), x='BuildingAge', binwidth=5)
plt.show()
columns_list = ['SiteEnergyUse(kBtu)', 'TotalGHGEmissions']
n_sub = len(columns_list)
n_col = 2
fig, axes = plt.subplots(int(round((0.49 + n_sub/n_col), 0)),
n_col, figsize=(10, int(2*n_sub)))
axes = np.array(axes)
i = 0
for ax in axes.reshape(-1):
ax.plot(building_df['BuildingAge'],
building_df[columns_list[i]], 'o')
ax.set_xlabel('BuildingAge')
ax.set_ylabel(columns_list[i])
i += 1
plt.show()
No much correlation of the building age with SiteEnergyUse(kBtu) and TotalGHGEmissions.
We will check building Decade since it's much easier to see small effect using building Decade over age.
columns_list=['SiteEnergyUse(kBtu)', 'SiteEUI(kBtu/sf)',
'TotalGHGEmissions', 'GHGEmissionsIntensity']
n_sub = len(columns_list)
n_col = 2
fig, axes = plt.subplots(int(round((0.49+n_sub/n_col), 0)),
n_col, figsize=(15, int(2.5*n_sub)))
axes = np.array(axes)
i = 0
BuildingDecade_list = building_df.groupby(by='BuildingDecade').nunique().sort_values('BuildingDecade',
ascending=True).index
for ax in axes.reshape(-1):
groupes = []
for m in BuildingDecade_list:
groupes.append(building_df[building_df['BuildingDecade'] == m][columns_list[i]])
ax.boxplot(groupes, labels=BuildingDecade_list, showfliers=False,
vert=False, patch_artist=True, showmeans=True)
ax.set_title('Distribution of {}'.format(columns_list[i]))
ax.set_xlabel(format(columns_list[i]))
ax.set_ylabel(format('BuildingDecade'))
i += 1
plt.show()
Small correlation of the building Decade with SiteEnergyUse(kBtu) but not with TotalGHGEmissions. However looking at the energy usage intensity column SiteEUI, the correlation disappear meaning that bigger GFA are built now a day than in the past which explain the higher energy usage of newer buildings. Thus the correlation of age is more linked to GFA.
ENERGY STAR score is a metric that compares a building’s energy use to other U.S. buildings on a scale from 1 to 100, where 1 is least efficient and 100 is most efficient.
plt.figure(figsize=(8, 2))
sns.histplot(data=building_df.sort_values('ENERGYSTARScore'),
x='ENERGYSTARScore', binwidth=5)
plt.show()
We see that the majority of buildings in Seattle are energy efficient.
columns_list = ['SiteEnergyUse(kBtu)', 'TotalGHGEmissions']
n_sub = len(columns_list)
n_col = 2
fig, axes = plt.subplots(int(round((0.49 + n_sub/n_col), 0)),
n_col, figsize=(10, int(2*n_sub)))
axes = np.array(axes)
i = 0
for ax in axes.reshape(-1):
ax.plot(building_df['ENERGYSTARScore'],
building_df[columns_list[i]], 'o')
ax.set_xlabel('ENERGYSTARScore')
ax.set_ylabel(columns_list[i])
i += 1
plt.show()
Low correlation of the ENERGYSTARScore with SiteEnergyUse(kBtu) and TotalGHGEmissions.
columns_list = ['NumberofBuildings', 'NumberofFloors']
n_sub = len(columns_list)
n_col = 2
fig, axes = plt.subplots(int(round((0.49+n_sub/n_col), 0)),
n_col, figsize=(15, int(2*n_sub)))
axes = np.array(axes)
i = 0
for ax in axes.reshape(-1):
sns.histplot(data=building_df[columns_list[i]], ax=ax)
ax.set_title('Distribution of {}'.format(columns_list[i]))
i += 1
plt.show()
NumberofBuildings is mostly 1, we will not use it for the prediction.
We check the correlation of number of Floors with SiteEnergyUse and TotalGHGEmissions:
columns_list=['SiteEnergyUse(kBtu)', 'SiteEUI(kBtu/sf)',
'TotalGHGEmissions', 'GHGEmissionsIntensity']
n_sub = len(columns_list)
n_col = 2
fig, axes = plt.subplots(int(round((0.49+n_sub/n_col), 0)),
n_col, figsize=(15, int(2.5*n_sub)))
axes = np.array(axes)
i = 0
BuildingDecade_list = building_df.groupby(by='NumberofFloors').nunique().head(10).sort_values('NumberofFloors',
ascending=True).index
for ax in axes.reshape(-1):
groupes = []
for m in BuildingDecade_list:
groupes.append(building_df[building_df['NumberofFloors'] == m][columns_list[i]])
ax.boxplot(groupes, labels=BuildingDecade_list, showfliers=False,
vert=False, patch_artist=True, showmeans=True)
ax.set_title('Distribution of {}'.format(columns_list[i]))
ax.set_xlabel(format(columns_list[i]))
ax.set_ylabel(format('NumberofFloors'))
i += 1
plt.show()
Similar observation to BuildingDecade, the NumberofFloors show correlation with SiteEnergyUse(kBtu), TotalGHGEmissions but not to SiteEUI(kBtu/sf) and GHGEmissionsIntensity is due to the direct link of GFA with the number of floors in the building.
columns_list=['SiteEUI(kBtu/sf)', 'SiteEnergyUse(kBtu)', 'TotalGHGEmissions',
'GHGEmissionsIntensity', 'BuildingAge', 'NumberofFloors',
'PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)',
'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA',
'ThirdLargestPropertyUseTypeGFA', 'Electricity(kBtu)',
'SteamUse(kBtu)', 'NaturalGas(kBtu)', 'ENERGYSTARScore']
plt.figure(figsize=(12, 5))
sns.heatmap(building_df[columns_list].corr(method='pearson'),
vmin=-1, vmax=1, annot=True, cmap='coolwarm', fmt='0.2f')
plt.xticks(ha='right', rotation=45, rotation_mode='anchor')
plt.show()
Final conclusion on numerical columns:
We see high correlation of SiteEnergyUse with PropertyGFATotal, PropertyGFABuilding, LargestPropertyUseTypeGFA, SecondLargestPropertyUseTypeGFA, SiteEUI, Electricity, NaturalGas, TotalGHGEmissions.
We see high correlation of TotalGHGEmissions with PropertyGFATotal, PropertyGFABuilding, LargestPropertyUseTypeGFA, SiteEUI, SiteEnergyUse, Steam, Electricity, NaturalGas, GHGEmissionsIntensity, and EnergySourceCounts.
For the categorical columns, they will be analysed and selected below.
# Function that plot the distributions of some numerical columns for each categorcical columns
def distribution_of_num_for_each_categ(df, columns_list, cat_columns):
n_sub = len(columns_list)
n_col = 2
for cat_column in cat_columns:
fig, axes = plt.subplots(int(round((0.49+n_sub/n_col), 0)),
n_col, figsize=(20, int(2*n_sub)))
axes = np.array(axes)
i = 0
top_BuildingType_list = df.groupby(by=cat_column).nunique().sort_values('OSEBuildingID',
ascending=False).head(10).index
for ax in axes.reshape(-1):
groupes = []
for m in top_BuildingType_list:
groupes.append(df[df[cat_column] == m][columns_list[i]])
ax.boxplot(groupes, labels=top_BuildingType_list, showfliers=False,
vert=False, patch_artist=True, showmeans=True)
ax.set_title('Distribution of {}'.format(columns_list[i]))
ax.set_xlabel(format(columns_list[i]))
if i%n_col == 0:
ax.set_ylabel(format(cat_column))
i += 1
plt.show()
# Plotting the distributions of some quantitative columns for each BuildingType
columns_list=['SiteEUI(kBtu/sf)', 'SiteEnergyUse(kBtu)',
'TotalGHGEmissions', 'GHGEmissionsIntensity']
cat_columns = ['ZipCode', 'CouncilDistrictCode', 'Neighborhood',
'BuildingType', 'PrimaryPropertyType',
'LargestPropertyUseType', 'SecondLargestPropertyUseType',
'ThirdLargestPropertyUseType',
'PropertyUseTypeCount', 'MainEnergySource',
'EnergySourceCount', 'BuildingDecade']
distribution_of_num_for_each_categ(building_df, columns_list, cat_columns)
# Defining Kruskal-Wallis H-test
def Kruskal_Wallis_test(df, columns_list, cat_columns):
Kruskal_Wallis_test_df = pd.DataFrame()
# Looping on the categorical column for the test
for cat_column in cat_columns:
# Defining lists for the test
categorical_columns = []
numerical_columns = []
H_statistic = []
p_value = []
df_names = []
categorical_columns.append(cat_column)
# Performing the Kruskal-Wallis test on each numerical column
for col in columns_list:
# Selecting the non null data
df_nonull = df[(df[[col, cat_column]].notna()).all(axis=1)]
# Calculating H_statistic and p_value
H, p = st.kruskal(*[df_nonull.loc[df_nonull[cat_column] == grp,col].values for grp in df_nonull[cat_column].unique()])
# Appending H_statistic and p_value for future comparaison
numerical_columns.append(col)
H_statistic.append(H)
p_value.append(p)
name = 'Kruskal_Wallis_test_df_' + cat_column
# Creating a dataframe with the H_statistic and p_value for each numerical column
name = pd.DataFrame([H_statistic,p_value], index=['H_statistic', 'p_value'])
name.columns = numerical_columns
name.insert(0, 'CategoricalColumn', [cat_column, cat_column], allow_duplicates=False)
# Concatunate together all dataframes corresponding to the categorical columns
Kruskal_Wallis_test_df = pd.concat([Kruskal_Wallis_test_df, name])
return Kruskal_Wallis_test_df
Kruskal_Wallis_test(building_df, columns_list, cat_columns)
| CategoricalColumn | SiteEUI(kBtu/sf) | SiteEnergyUse(kBtu) | TotalGHGEmissions | GHGEmissionsIntensity | |
|---|---|---|---|---|---|
| H_statistic | ZipCode | 1.074557e+02 | 1.155928e+02 | 9.406672e+01 | 8.551376e+01 |
| p_value | ZipCode | 7.269289e-08 | 4.976453e-09 | 4.734998e-06 | 5.670074e-05 |
| H_statistic | CouncilDistrictCode | 4.333137e+01 | 3.865240e+01 | 2.210153e+01 | 3.416797e+01 |
| p_value | CouncilDistrictCode | 1.002914e-07 | 8.372799e-07 | 1.160629e-03 | 6.243138e-06 |
| H_statistic | Neighborhood | 6.140892e+01 | 6.641882e+01 | 3.903896e+01 | 5.873441e+01 |
| p_value | Neighborhood | 1.208865e-06 | 1.805731e-07 | 2.815525e-03 | 3.271380e-06 |
| H_statistic | BuildingType | 7.148699e+01 | 1.065668e+01 | 3.631487e+01 | 4.063549e+01 |
| p_value | BuildingType | 2.050235e-15 | 1.373489e-02 | 6.424059e-08 | 7.813149e-09 |
| H_statistic | PrimaryPropertyType | 4.274045e+02 | 4.428183e+02 | 2.775320e+02 | 2.163131e+02 |
| p_value | PrimaryPropertyType | 1.189236e-81 | 6.725689e-85 | 2.558995e-50 | 1.009420e-37 |
| H_statistic | LargestPropertyUseType | 4.293762e+02 | 3.053010e+02 | 1.879924e+02 | 2.347652e+02 |
| p_value | LargestPropertyUseType | 7.577820e-78 | 2.649940e-52 | 8.201291e-29 | 4.619342e-38 |
| H_statistic | SecondLargestPropertyUseType | 7.388256e+01 | 1.013065e+02 | 4.200157e+01 | 6.480390e+01 |
| p_value | SecondLargestPropertyUseType | 1.718190e-03 | 8.267882e-07 | 4.709062e-01 | 1.347963e-02 |
| H_statistic | ThirdLargestPropertyUseType | 3.562570e+01 | 6.500480e+01 | 3.729042e+01 | 3.156975e+01 |
| p_value | ThirdLargestPropertyUseType | 3.015860e-01 | 4.986926e-04 | 2.387332e-01 | 4.882233e-01 |
| H_statistic | PropertyUseTypeCount | 3.070239e+01 | 2.803995e+01 | 4.847247e+00 | 4.132318e+00 |
| p_value | PropertyUseTypeCount | 2.153082e-07 | 8.150852e-07 | 8.859999e-02 | 1.266714e-01 |
| H_statistic | MainEnergySource | 2.526713e+00 | 5.613544e+00 | 1.818204e+02 | 3.722625e+02 |
| p_value | MainEnergySource | 2.827036e-01 | 6.039966e-02 | 3.297693e-40 | 1.459590e-81 |
| H_statistic | EnergySourceCount | 3.937969e+01 | 3.642654e+01 | 4.055901e+02 | 5.554936e+02 |
| p_value | EnergySourceCount | 1.442187e-08 | 6.084035e-08 | 1.362323e-87 | 4.478834e-120 |
| H_statistic | BuildingDecade | 4.361738e+01 | 9.666541e+01 | 3.749635e+01 | 4.773778e+01 |
| p_value | BuildingDecade | 8.476837e-06 | 8.148456e-16 | 9.511647e-05 | 1.589846e-06 |
Kruskal_Wallis_test shows that all p_value are below 5% rejecting H0 hypothesis and showing that there are effect on SiteEnergyUse(kBtu) and TotalGHGEmissions from the categories in 'BuildingType', 'PrimaryPropertyType', 'ZipCode', 'CouncilDistrictCode', 'Neighborhood', 'LargestPropertyUseType', 'SecondLargestPropertyUseType', 'ThirdLargestPropertyUseType','MainEnergySource', 'EnergySourceCount', 'PropertyUseTypeCount', and 'BuildingDecade'.
Now let's focus on what categorical columns to select for the prediction:
We will have two features set for the predictions, one without the columns ending with _GFAratio and one with columns ending with _GFAratio.
SiteEnergyUse:
set_1: without all columns ending with _GFAratio:
set_2: with all columns ending with _GFAratio:
TotalGHGEmissions:
set_1: without all columns ending with _GFAratio:
set_2: with all columns ending with _GFAratio:
The categorical columns will be OneHotEncoded. For the numerical column (except all columns ending with _GFAratio), we will test different scaling technique: Standard scaler, RobustScaler, MinMaxScaler, and PowerTransformer.
print('Percentage of NaN in the dataset equal to',
sum(building_df.isna().sum() / building_df.shape[0]*100) / building_df.shape[1])
df_description(building_df)
Percentage of NaN in the dataset equal to 5.280653193291119
| OSEBuildingID | BuildingType | PrimaryPropertyType | PropertyName | ZipCode | CouncilDistrictCode | Neighborhood | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | ListOfAllPropertyUseTypes | LargestPropertyUseType | LargestPropertyUseTypeGFA | SecondLargestPropertyUseType | SecondLargestPropertyUseTypeGFA | ThirdLargestPropertyUseType | ThirdLargestPropertyUseTypeGFA | ENERGYSTARScore | SiteEUI(kBtu/sf) | SiteEnergyUse(kBtu) | SteamUse(kBtu) | Electricity(kBtu) | NaturalGas(kBtu) | TotalGHGEmissions | GHGEmissionsIntensity | OtherFuel(kBtu) | BuildingAge | BuildingDecade | Bank Branch_GFAratio | College/University_GFAratio | Data Center_GFAratio | Distribution Center_GFAratio | Fitness Center/Health Club/Gym_GFAratio | K-12 School_GFAratio | Manufacturing/Industrial Plant_GFAratio | Medical Office_GFAratio | Mixed Use Property_GFAratio | Multifamily Housing_GFAratio | Non-Refrigerated Warehouse_GFAratio | Office_GFAratio | Other_GFAratio | Other - Entertainment/Public Assembly_GFAratio | Other - Recreation_GFAratio | Other - Services_GFAratio | Parking_GFAratio | Refrigerated Warehouse_GFAratio | Repair Services (Vehicle, Shoe, Locksmith, etc)_GFAratio | Restaurant_GFAratio | Retail Store_GFAratio | Self-Storage Facility_GFAratio | Senior Care Community_GFAratio | Social/Meeting Hall_GFAratio | Supermarket/Grocery Store_GFAratio | Worship Facility_GFAratio | PropertyUseTypeCount | MainEnergySource | EnergySourceCount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| type | int64 | object | object | object | float64 | int64 | object | float64 | float64 | int64 | int64 | int64 | object | object | float64 | object | float64 | object | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | int64 | int64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | int64 | object | int64 |
| number null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 747.0 | 747.0 | 1105.0 | 1105.0 | 474.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| percentage null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 55.704698 | 55.704698 | 82.401193 | 82.401193 | 35.346756 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| count | 1341.0 | 1341 | 1341 | 1341 | 1341.0 | 1341.0 | 1341 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341 | 1341 | 1341.0 | 594 | 594.0 | 236 | 236.0 | 867.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341.0 | 1341 | 1341.0 |
| unique | NaN | 4 | 16 | 1338 | NaN | NaN | 19 | NaN | NaN | NaN | NaN | NaN | 281 | 22 | NaN | 43 | NaN | 33 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3 | NaN |
| top | NaN | NonResidential | Small- and Mid-Sized Office | Airport Way | NaN | NaN | GREATER DUWAMISH | NaN | NaN | NaN | NaN | NaN | Office | Office | NaN | Parking | NaN | Office | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Electricity(kBtu) | NaN |
| freq | NaN | 1170 | 289 | 2 | NaN | NaN | 322 | NaN | NaN | NaN | NaN | NaN | 129 | 415 | NaN | 152 | NaN | 39 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 998 | NaN |
| mean | 17192.407905 | NaN | NaN | NaN | 98117.511559 | 4.155854 | NaN | 1.054437 | 2.788963 | 65204.343028 | 6778.224459 | 58426.118568 | NaN | NaN | 52921.695004 | NaN | 18631.030971 | NaN | 7532.454661 | 65.268743 | 59.480686 | 3392866.183954 | 81667.36371 | 2451620.760013 | 844129.878477 | 68.226734 | 1.242177 | 15448.181754 | 56.33035 | 195.536167 | 0.001136 | 0.010766 | 0.000603 | 0.036152 | 0.003098 | 0.097992 | 0.005331 | 0.019926 | 0.002829 | 0.006166 | 0.134384 | 0.274615 | 0.101099 | 0.01212 | 0.01327 | 0.002146 | 0.043729 | 0.008388 | 0.00492 | 0.010531 | 0.073846 | 0.019778 | 0.008679 | 0.006267 | 0.023417 | 0.049998 | 1.618941 | NaN | 1.741238 |
| std | 12813.579527 | NaN | NaN | NaN | 18.003629 | 2.180378 | NaN | 0.407967 | 2.443278 | 59120.21036 | 23798.419674 | 47574.437729 | NaN | NaN | 45749.688583 | NaN | 21551.44345 | NaN | 7737.210878 | 28.105098 | 46.91207 | 3531540.012045 | 507775.349524 | 2951370.957907 | 1223941.285058 | 81.680099 | 1.370458 | 358531.100048 | 32.150656 | 3.188072 | 0.014974 | 0.10179 | 0.009205 | 0.174062 | 0.033733 | 0.294543 | 0.068781 | 0.124538 | 0.033834 | 0.046094 | 0.314654 | 0.374637 | 0.276869 | 0.099387 | 0.102131 | 0.025985 | 0.12683 | 0.086722 | 0.063918 | 0.079024 | 0.224152 | 0.135193 | 0.088001 | 0.068601 | 0.141105 | 0.213572 | 0.766986 | NaN | 0.486542 |
| min | 9.0 | NaN | NaN | NaN | 98006.0 | 1.0 | NaN | 1.0 | 1.0 | 11285.0 | 0.0 | 10925.0 | NaN | NaN | 5656.0 | NaN | 200.0 | NaN | 182.0 | 1.0 | 0.0 | 57133.19922 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 190.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | NaN | 0.0 |
| 25% | 707.0 | NaN | NaN | NaN | 98105.0 | 2.0 | NaN | 1.0 | 1.0 | 27800.0 | 0.0 | 26830.0 | NaN | NaN | 23751.0 | NaN | 6485.25 | NaN | 2968.75 | 49.0 | 31.0 | 1108311.0 | 0.0 | 642420.0 | 0.0 | 17.08 | 0.33 | 0.0 | 30.0 | 192.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | NaN | 1.0 |
| 50% | 21425.0 | NaN | NaN | NaN | 98115.0 | 4.0 | NaN | 1.0 | 2.0 | 43183.0 | 0.0 | 41221.0 | NaN | NaN | 36444.0 | NaN | 10766.0 | NaN | 5348.5 | 73.0 | 47.5 | 2042771.0 | 0.0 | 1293349.0 | 390854.0 | 39.05 | 0.79 | 0.25 | 53.0 | 196.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | NaN | 2.0 |
| 75% | 24629.0 | NaN | NaN | NaN | 98125.0 | 7.0 | NaN | 1.0 | 3.0 | 75600.0 | 0.0 | 67755.0 | NaN | NaN | 63000.0 | NaN | 20938.5 | NaN | 9590.75 | 88.0 | 73.199997 | 4323672.0 | 0.0 | 3020001.0 | 1073829.0 | 83.45 | 1.58 | 0.5 | 87.0 | 198.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.587782 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | NaN | 2.0 |
| max | 50226.0 | NaN | NaN | NaN | 98199.0 | 7.0 | NaN | 6.0 | 27.0 | 378002.0 | 272900.0 | 323916.0 | NaN | NaN | 289588.0 | NaN | 139000.0 | NaN | 58152.0 | 100.0 | 335.5 | 19295226.0 | 7413886.0 | 19036413.0 | 7116213.0 | 639.72 | 10.54 | 12525174.0 | 116.0 | 201.0 | 0.270421 | 1.0 | 0.266239 | 1.0 | 0.461031 | 1.0 | 1.0 | 1.0 | 0.548901 | 0.528618 | 1.0 | 2.452054 | 1.0 | 1.0 | 1.0 | 0.495667 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 3.0 | NaN | 3.0 |
building_df = building_df.drop(['OSEBuildingID', 'PropertyName', 'CouncilDistrictCode',
'Neighborhood', 'NumberofBuildings', 'PropertyGFAParking',
'ListOfAllPropertyUseTypes', 'LargestPropertyUseType',
'PropertyName', 'SecondLargestPropertyUseType',
'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseType',
'ThirdLargestPropertyUseTypeGFA', 'SiteEUI(kBtu/sf)',
'SteamUse(kBtu)', 'Electricity(kBtu)', 'NaturalGas(kBtu)',
'GHGEmissionsIntensity', 'OtherFuel(kBtu)'], axis=1)
print('Percentage of NaN in the dataset equal to',
sum(building_df.isna().sum() / building_df.shape[0]*100) / building_df.shape[1])
print('The final number of columns equal to', building_df.shape[1],
'out of the initial number of', initial_nbr_columns, 'columns.')
print('The final number of rows equal to', building_df.shape[0],
'corresponding to', building_df.shape[0] / initial_nbr_rows*100,
'% of the initial numbers of rows.')
building_df = building_df.reset_index(drop=True)
building_df.to_csv('building_df_cleaned.csv', sep='\t', index=False)
Percentage of NaN in the dataset equal to 0.8621160037103727 The final number of columns equal to 41 out of the initial number of 46 columns. The final number of rows equal to 1341 corresponding to 39.72156398104265 % of the initial numbers of rows.